Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

192 files changed: +7070 −2451
+11 −15

@@ -12,7 +12,7 @@ docs:
   image: it4innovations/docker-mdcheck:latest
   allow_failure: true
   script:
-  - mdl -r ~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046 *.md docs.it4i # BUGS
+  - find content/docs -name "*.mdx" | xargs mdl -r ~MD002,~MD007,~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046

 pylint:
   stage: test
@@ -22,20 +22,16 @@ pylint:
   script:
   - pylint $(find . -name "*.py" -not -name "feslicescript.py")

-pysafety:
+capitalize:
   stage: test
-  image: it4innovations/docker-pycheck:latest
+  image: it4innovations/docker-mkdocscheck:latest
   allow_failure: true
   before_script:
   - source /opt/.venv3/bin/activate
-  - python -V # debug
+  - pip list | grep titlecase
   script:
-  - cat requirements.txt | safety check --stdin --full-report
-
-capitalize:
-  stage: test
-  image: it4innovations/docker-mkdocscheck:latest
-  script:
-  - find mkdocs.yml docs.it4i/ \( -name '*.md' -o -name '*.yml' \) -print0 | xargs -0 -n1 scripts/titlemd.py --test
+  - find content/docs/ \( -name '*.mdx' -o -name '*.yml' \) ! -path '*einfracz*' -print0 | xargs -0 -n1 scripts/titlemd.py --test

 ext_links:
   stage: after_test
@@ -45,7 +41,7 @@ ext_links:
   # remove JSON results
   - rm *.json
   script:
-  - find docs.it4i/ -name '*.md' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
+  - find content/docs -name '*.mdx' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
   only:
   - master

@@ -55,8 +51,8 @@ ext_links:
   before_script:
   - echo "192.168.101.10 docs.it4i.cz" >> /etc/hosts
   - wget -V
-  - echo https://docs.it4i.cz/devel/$CI_BUILD_REF_NAME/
+  - echo https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
-  - wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_BUILD_REF_NAME/ || true
+  - wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/ || true
   script:
   - cat wget.log | awk '/^Found [0-9]+ broken link[s]?.$/,/FINISHED/ { rc=-1; print $0 }; END { exit rc }'

@@ -75,7 +71,7 @@ mkdocs:
     # get modules list from clusters
   - bash scripts/get_modules.sh
     # generate site_url
-  - (if [ "${CI_BUILD_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_BUILD_REF_NAME\//" mkdocs.yml;fi);
+  - (if [ "${CI_COMMIT_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_COMMIT_REF_NAME\//" mkdocs.yml;fi);
     # generate ULT for code link
 #  - sed -i "s/master/$CI_BUILD_REF_NAME/g" material/partials/toc.html
     # regenerate modules matrix

@@ -113,7 +109,7 @@ deploy to stage:
   - echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
   script:
   - chown nginx:nginx site -R
-  - rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_BUILD_REF_NAME/
+  - rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
   only:
   - branches@sccs/docs.it4i.cz
+27 −0

+Quantum Scalar I6
+JAN
+LUMI
+AI
+CI/CD
+AWS
+CLI
+FAQ
+s3cmd
+GUI
+EESSI
 hipBlas
 hipSolver
 LUMI
@@ -822,3 +833,19 @@ e-INFRA CZ
 DICE
 qgpu
 qcpu
+it4i-portal-clients
+it4icheckaccess
+it4idedicatedtime
+it4ifree
+it4ifsusage
+it4iuserfsusage
+it4iprojectfsusage
+it4imotd
+e-INFRA
+it4i-portal-clients
+s3cmd
+s5cmd
+title:
+e-INFRA CZ Cloud Ostrava
+e-INFRA CZ Account
+3 −53

-# User Documentation
+# IT4Innovations Documentation

 This project contains IT4Innovations user documentation source.

-## Development
+## Migration

-### Install
-
-```console
-$ sudo apt install libpython-dev
-$ virtualenv venv
-$ source venv/bin/activate
-$ pip install -r requirements.txt
-```
-
-### Package Upgrade With pip
-
-```console
-$ pip list -o
-$ pip install --upgrade package
-$ pip freeze | sed '/pkg-resources==/d' > requirements.txt
-```
-
-## Environments
-
-* [https://docs.it4i.cz - master branch](https://docs.it4i.cz - master branch)
-* [https://docs.it4i.cz/devel/$BRANCH_NAME](https://docs.it4i.cz/devel/$BRANCH_NAME) - maps the branches, available only with VPN access
-
-## URLs
-
-* [http://facelessuser.github.io/pymdown-extensions/](http://facelessuser.github.io/pymdown-extensions/)
-* [http://squidfunk.github.io/mkdocs-material/](http://squidfunk.github.io/mkdocs-material/)
-
-```
-fair-share
-InfiniBand
-RedHat
-CentOS
-Mellanox
-```
-
-## Mathematical Formulae
-
-### Formulas Are Made With:
-
-* [https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/](https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/)
-* [https://www.mathjax.org/](https://www.mathjax.org/)
-
-You can add formula to page like this:
-
-```
-$$
-MAX\_FAIRSHARE * ( 1 - \frac{usage_{Project}}{usage_{Total}} )
-$$
-```
-
-To enable the MathJX on page you need to enable it by adding line ```---8<--- "mathjax.md"``` at the end of file.
+* [fumadocs](https://fumadocs.vercel.app/)
\ No newline at end of file

docs.it4i/apiv2.md (new file, 0 → 100644): +203 −0
# SCS API v2

## Info

- **OpenAPI:** 3.1.0
- **Title:** scs-api-2
- **Version:** 0.1.0
- **Server URL:** `https://scs.it4i.cz/api/v2`

## Paths

### `/dedicated-time`

**GET**

- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries, optionally filtered by cluster name or period preset
- **OperationId:** `dedicated_time_handler`

**Parameters:**

- `cluster` (query): Filter by cluster name; Available values: karolina, barbora, dgx *(optional)*
- `period` (query): Filter by time period preset; Available values: planned, active *(optional)*

**Responses:**

- `200`: List of dedicated time entries
- `400`: Failed to deserialize query, Invalid cluster, Invalid period
  Example:

  ```json
  {
    "message": "Invalid cluster: el_gordo"
  }
  ```
- `500`: Failed to retrieve dedicated time due to a server error
  Example:
  ```json
  {
    "message": "Failed to retreive dedicated time"
  }
  ```
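
For illustration, a query against this endpoint might look as follows. This is only a sketch: it assumes the server URL from the Info section above and that the endpoint requires no authentication, which the specification excerpt does not state.

```console
$ curl "https://scs.it4i.cz/api/v2/dedicated-time?cluster=karolina&period=active"
```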

### `/dedicated-time-calendar`

**GET**

- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries and generates a VCalendar response.
- **OperationId:** `dedicated_time_calendar`

**Responses:**

- `200`: Dedicated time VCalendar
  Example:

  ```
  BEGIN:VCALENDAR
  VERSION:2.0
  PRODID:-//SUTD Timetable Calendar//randName//EN
  CALSCALE:GREGORIAN
  BEGIN:VEVENT
  UID:1234@example.com
  DTSTAMP:20230101T000000Z
  DTSTART:20230101T000000Z
  DTEND:20230102T000000Z
  SUMMARY:Sample Dedicated Time - Cluster Outage
  DESCRIPTION:Sample Dedicated Time - Cluster Outage
  END:VEVENT
  END:VCALENDAR
  ```

- `500`: Failed to retrieve dedicated time calendar
  Example:

  ```json
  {
    "message": "Failed to retreive dedicated time calendar"
  }
  ```

### `/motd`

**GET**

- **Summary:** Get messages of the day
- **Description:** Retrieves messages of the day, optionally filtered by category
- **OperationId:** `motd`

**Parameters:**

- `category` (query): *(optional)*

**Responses:**

- `200`: List of motd entries
- `400`: Failed to deserialize query, Invalid motd category
- `500`: Failed to retrieve motd entries due to a server error
  Example:

  ```json
  {
    "message": "Failed to retrieve motd"
  }
  ```
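
Again for illustration only (same assumed base URL, no authentication), filtering by category might look like this; the category value is borrowed from the Motd schema example below, since the excerpt does not list the accepted categories:

```console
$ curl "https://scs.it4i.cz/api/v2/motd?category=public-service-announcement"
```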

## Components

### Schemas

#### DedicatedTime

```yaml
type: object
required:
  - updated_at
properties:
  cluster_type:
    type: [string, 'null']
  date_efficiency:
    type: [string, 'null']
    format: date-time
  date_expiration:
    type: [string, 'null']
    format: date-time
  updated_at:
    type: string
    format: date-time
```
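
A response entry matching this schema could look like the following sketch (all values are illustrative, not taken from a real system):

```json
{
  "cluster_type": "karolina",
  "date_efficiency": "2025-01-10T08:00:00Z",
  "date_expiration": "2025-01-11T08:00:00Z",
  "updated_at": "2025-01-09T12:00:00Z"
}
```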

#### Motd

```yaml
type: object
required:
  - id
  - author
  - category
  - created_at
  - updated_at
  - date_modification
  - title
  - message_body
  - systems
properties:
  id:
    type: integer
    format: int32
    examples: [1]
  author:
    type: string
    examples: [Admin]
  category:
    type: string
    examples: [public-service-announcement]
  created_at:
    type: string
    format: date-time
  updated_at:
    type: string
    format: date-time
  date_modification:
    type: string
    format: date-time
  date_efficiency:
    type: [string, 'null']
    format: date-time
  date_expiration:
    type: [string, 'null']
    format: date-time
  date_outage_efficiency:
    type: [string, 'null']
    format: date-time
  date_outage_expiration:
    type: [string, 'null']
    format: date-time
  title:
    type: string
    examples: [Important Update]
  message_body:
    type: string
    examples: [We are experiencing some service disruptions.]
  systems:
    type: array
    items:
      type: string
      examples: [Karolina]
```
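
Using the example values given in the schema, a single Motd entry might be serialized like this (timestamps and null fields are illustrative):

```json
{
  "id": 1,
  "author": "Admin",
  "category": "public-service-announcement",
  "created_at": "2025-01-09T12:00:00Z",
  "updated_at": "2025-01-09T12:00:00Z",
  "date_modification": "2025-01-09T12:00:00Z",
  "date_efficiency": null,
  "date_expiration": null,
  "date_outage_efficiency": null,
  "date_outage_expiration": null,
  "title": "Important Update",
  "message_body": "We are experiencing some service disruptions.",
  "systems": ["Karolina"]
}
```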

#### MsgResponse

```yaml
type: object
description: |
  Common struct for DTO-less responses
  eg. ```200 {"message":"Operation succeeded"}```
required:
  - message
properties:
  message:
    type: string
    examples: [API response]
```
# Hardware Overview

!!!important Work in progress
    Barbora NG documentation is a WIP.
    The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.

    The launch of Barbora NG is planned for October/November.
    In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.

Barbora NG consists of 141 non-accelerated compute nodes named **cn[001-141]**.
Each node is a powerful x86-64 computer equipped with 192 cores
(2x Intel Xeon 6952P with 96 CPU cores) and 768 GB RAM.
User access to the Barbora NG cluster is provided by two login nodes **login[1-2]**.
The nodes are interlinked through high speed InfiniBand NDR and Ethernet networks.

The parameters are summarized in the following tables:

| **In general**                       |                       |
| ------------------------------------ | --------------------- |
| Architecture of compute nodes        | x86-64                |
| Operating system                     | Linux                 |
| [**Compute nodes**][1]               |                       |
| Total                                | 141                   |
| Processor Type                       | [Intel Xeon 6952P][b] |
| Architecture                         | Granite Rapids        |
| Processor cores                      | 96                    |
| Processors per node                  | 2                     |
| RAM                                  | 768 GB                |
| Local disk drive                     | no                    |
| Compute network                      | InfiniBand NDR        |
| non-accelerated                      | 141, cn[001-141]        |
| **In total**                         |                       |
| Theoretical peak performance (Rpeak) | ??? TFLOP/s           |
| Cores                                | 27072                 |
| RAM                                  | 108.288 TB            |

[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md
[6]: visualization.md

[a]: https://support.it4i.cz/rt
[b]: https://www.intel.com/content/www/us/en/products/sku/241643/intel-xeon-6952p-processor-480m-cache-2-10-ghz/specifications.html
+36 −0
# Introduction

!!!important Work in progress
    Barbora NG documentation is a WIP.
    The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.

    The launch of Barbora NG is planned for October/November.
    In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.

Welcome to Barbora Next Gen (NG) supercomputer cluster.
Barbora NG is our latest supercomputer which consists of 141 compute nodes,
totaling 27072 compute cores with 108288 GB RAM, giving over ??? TFLOP/s theoretical peak performance.

Nodes are interconnected through a fully non-blocking fat-tree InfiniBand NDR network
and are equipped with Intel Granite Rapids processors.
Read more in [Hardware Overview][1].

The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains.
These packages are accessible via the [modules environment][2].

The user data shared file system and job data shared file system are available to users.

The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].

Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].


[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md

[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://slurm.schedmd.com/
@@ -8,7 +8,7 @@ The cluster runs with an operating system compatible with the Red Hat [Linux fam

 The user data shared file system and job data shared file system are available to users.

-The [PBS Professional Open Source Project][b] workload manager provides [computing resources allocations and job execution][3].
+The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].

 Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].

@@ -22,4 +22,4 @@ Read more on how to [apply for resources][4], [obtain login credentials][5] and
 [6]: ../general/shell-and-data-access.md

 [a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
-[b]: https://www.pbspro.org/
+[b]: https://slurm.schedmd.com/
@@ -120,7 +120,7 @@ The filesystem is backed up, so that it can be restored in case of a catastrophi

 The SCRATCH is realized as Lustre parallel file system and is available from all login and computational nodes. There are 5 OSTs dedicated for the SCRATCH file system.

-The SCRATCH filesystem is mounted in directory /scratch. Users may freely create subdirectories and files on the filesystem. Accessible capacity is 310TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and deny service to other users. Should 10TB prove insufficient, contact [support][d], the quota may be lifted upon request.
+The SCRATCH filesystem is mounted in the `/scratch/project/PROJECT_ID` directory created automatically with the `PROJECT_ID` project. Accessible capacity is 310TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and deny service to other users. Should 10TB prove insufficient, contact [support][d], the quota may be lifted upon request.

 !!! note
     The Scratch filesystem is intended for temporary scratch data generated during the calculation as well as for high-performance access to input and output files. All I/O intensive jobs must use the SCRATCH filesystem as their working directory.
 # e-INFRA CZ Cloud Ostrava

-Ostrava cloud consists of 28 nodes from [Karolina][a] supercomputer.
+Ostrava cloud consists of 22 nodes from the [Karolina][a] supercomputer.
 The cloud site is built on top of OpenStack,
 which is a free open standard cloud computing platform.

@@ -61,15 +61,15 @@ For the list of deployed OpenStack services, see the [list of components][1].

 More information can be found on the [e-INFRA CZ website][2].

-[1]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/openstack-components/
+[1]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/openstack-components
-[2]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/
+[2]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site
-[3]: https://docs.e-infra.cz/account/
+[3]: https://docs.account.e-infra.cz/en/docs/access/account#how-to-apply-for-the-first-time
-[4]: https://docs.e-infra.cz/compute/openstack/getting-started/creating-first-infrastructure/
+[4]: https://docs.platforms.cloud.e-infra.cz/en/docs/getting-started/creating-first-infrastructure
-[5]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/quota-limits/
+[5]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/quota-limits
 [6]: https://ostrava.openstack.cloud.e-infra.cz/
-[7]: https://docs.fuga.cloud/how-to-use-the-openstack-cli-tools-on-linux
+[7]: https://cyso.cloud/docs/cloud/extra/how-to-use-the-openstack-cli-tools-on-linux/
 [8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
-[9]: https://docs.e-infra.cz/compute/openstack/how-to-guides/obtaining-api-key/
+[9]: https://docs.platforms.cloud.e-infra.cz/en/docs/how-to-guides/obtaining-api-key
 [10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline

 [a]: ../karolina/introduction.md
+143 −0
# IT4I Cloud

IT4I cloud consists of 14 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.

!!! note
    The guide describes steps for personal projects.<br>
    Some steps may differ for large projects.<br>
    For large projects, apply for resources to the [Allocation Committee][11].

## Access

To access the cloud you must be a member of an active EUROHPC project,
or fall into the **Access Category B**, i.e. [Access For Thematic HPC Resource Utilisation][11].

A personal OpenStack project is required. Request one by contacting [IT4I Support][12].

The dashboard is available at [https://cloud.it4i.cz][6].

You can see quotas set for the IT4I Cloud in the [Quota Limits][f] section.

## Creating First Instance

To create your first VM instance, follow the steps below:

### Log In

Go to [https://cloud.it4i.cz][6], enter your LDAP username and password and choose the `IT4I_LDAP` domain. After you sign in, you will be redirected to the dashboard.

![](../img/login.png)

### Create Key Pair

SSH key is required for remote access to your instance.

1. Go to **Project > Compute > Key Pairs** and click the **Create Key Pair** button.

    ![](../img/keypairs.png)

1. In the Create Key Pair window, name your key pair, select `SSH Key` for key type and confirm by clicking Create Key Pair.

    ![](../img/keypairs1.png)

1. Download and manage the private key according to your operating system.

### Update Security Group

To be able to remotely access your VM instance, you have to allow access in the security group.

1. Go to **Project > Network > Security Groups** and click on **Manage Rules**, for the default security group.

    ![](../img/securityg.png)

1. Click on **Add Rule**, choose **SSH**, and leave the remaining fields unchanged.

    ![](../img/securityg1.png)

### Create VM Instance

1. In **Compute > Instances**, click **Launch Instance**.

    ![](../img/instance.png)

1. Choose Instance Name, Description, and number of instances. Click **Next**.

    ![](../img/instance1.png)

1. Choose an image from which to boot the instance. Choose to delete the volume after instance delete. Click **Next**.

    ![](../img/instance2.png)

1. Choose the hardware resources of the instance by selecting a flavor. Additional volumes for data can be attached later on. Click **Next**.

    ![](../img/instance3.png)

1. Select the network and continue to **Security Groups**.

    ![](../img/instance4.png)

1. Allocate the security group with SSH rule that you added in the [Update Security Group](it4i-cloud.md#update-security-group) step. Then click **Next** to go to the **Key Pair**.

    ![](../img/securityg2.png)

1. Select the key that you created in the [Create Key Pair][g] section and launch the instance.

    ![](../img/instance5.png)

### Associate Floating IP

1. Click on the **Associate** button next to the floating IP.

    ![](../img/floatingip.png)

1. Select Port to be associated with the instance, then click the **Associate** button.

Now you can join the VM using your preferred SSH client.
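
For example, on Linux or macOS a connection sketch could look like the one below; the user name depends on the image you chose (e.g. `ubuntu` for Ubuntu cloud images), the key file name is a placeholder for the private key downloaded in the Create Key Pair step, and `<FLOATING_IP>` is the address associated above:

```console
ssh -i ~/Downloads/my-keypair.pem ubuntu@<FLOATING_IP>
```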

## Process Automation

You can automate the process using the OpenStack command-line tools.

### OpenStack

Prerequisites:

* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]

Follow the guide: [https://code.it4i.cz/commandline][10]

Run commands:

```console
source project_openrc.sh.inc
```

```console
./cmdline-demo.sh basic-infrastructure-1
```
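
To verify that the client and your credentials work before running the demo script, you can, for example, list the resources visible to your project (standard OpenStack CLI commands; the output depends on your project):

```console
openstack flavor list
openstack server list
```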

[1]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/openstack-components/
[2]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/
[3]: https://docs.e-infra.cz/account/
[4]: https://docs.e-infra.cz/compute/openstack/getting-started/creating-first-infrastructure/
[5]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-g2-site/quota-limits/
[6]: https://cloud.it4i.cz
[7]: https://docs.fuga.cloud/how-to-use-the-openstack-cli-tools-on-linux
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.e-infra.cz/compute/openstack/how-to-guides/obtaining-api-key/
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline
[11]: https://www.it4i.cz/en/for-users/computing-resources-allocation
[12]: mailto:support@it4i.cz

[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
[c]: einfracz-cloud.md
[d]: ../general/accessing-the-clusters/vpn-access.md
[e]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[f]: it4i-quotas.md
[g]: it4i-cloud.md#create-key-pair

+31 −0
# IT4I Cloud Quotas

| Resource                              | Quota |
|---------------------------------------|-------|
| Instances                             |    10 |
| VCPUs                                 |    20 |
| RAM                                   |  32GB |
| Volumes                               |    20 |
| Volume Snapshots                      |    12 |
| Volume Storage                        |   500 |
| Floating-IPs                          |     1 |
| Security Groups                       |    10 |
| Security Group Rules                  |   100 |
| Networks                              |     1 |
| Ports                                 |    10 |
| Routers                               |     1 |
| Backups                               |    12 |
| Groups                                |    10 |
| rbac_policies                         |    10 |
| Subnets                               |     1 |
| Subnet_pools                          |    -1 |
| Fixed-ips                             |    -1 |
| Injected-file-size                    | 10240 |
| Injected-path-size                    |   255 |
| Injected-files                        |     5 |
| Key-pairs                             |   100 |
| Properties                            |   128 |
| Server-groups                         |    10 |
| Server-group-members                  |    10 |
| Backup-gigabytes                      |  1002 |
| Per-volume-gigabytes                  |    -1 |
+301 −0
# Using NVIDIA Grace Partition

For testing your application on the NVIDIA Grace Partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 144 -A PROJECT-ID -p p11-grace --time=08:00:00
```

where:

- `-N 1` means allocation of a single node,
- `-c 144` means allocation of 144 cores,
- `-p p11-grace` is the NVIDIA Grace partition,
- `--time=08:00:00` means allocation for 8 hours.
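
A minimal batch script for the same partition might look like the sketch below; the job name and the final command are placeholders, and `PROJECT-ID` must be replaced by your project:

```bash
#!/bin/bash
#SBATCH --job-name=grace-test        # placeholder name
#SBATCH --account=PROJECT-ID         # your project ID
#SBATCH --partition=p11-grace        # NVIDIA Grace partition
#SBATCH --nodes=1
#SBATCH --cpus-per-task=144
#SBATCH --time=08:00:00

ml NVHPC            # or GCC / Clang for Grace, see the toolchains below
./my_application    # placeholder for your binary
```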

## Available Toolchains

The platform offers three toolchains:

- Standard GCC (as a module `ml GCC`)
- [NVHPC](https://developer.nvidia.com/hpc-sdk) (as a module `ml NVHPC`)
- [Clang for NVIDIA Grace](https://developer.nvidia.com/grace/clang) (installed in `/opt/nvidia/clang`)

!!! note
    The NVHPC toolchain showed strong results with a minimal amount of tuning necessary in our initial evaluation.

### GCC Toolchain

The GCC compiler seems to struggle with vectorization of short (constant-length) loops, which tend to get completely unrolled/eliminated instead of being vectorized. For example, a simple nested loop such as

```cpp
for(int i = 0; i < 1000000; ++i) {
    // Iterations dependent in "i"
    // ...
    for(int j = 0; j < 8; ++j) {
        // but independent in "j"
        // ...
    }
}
```

may emit scalar code for the inner loop, leading to no vectorization being used at all.

### Clang (For Grace) Toolchain

Clang/LLVM tends to behave similarly, but it can be guided to properly vectorize the inner loop with either the flags `-O3 -ffast-math -march=native -fno-unroll-loops -mllvm -force-vector-width=8` or pragmas such as `#pragma clang loop vectorize_width(8)` and `#pragma clang loop unroll(disable)`.

```cpp
for(int i = 0; i < 1000000; ++i) {
    // Iterations dependent in "i"
    // ...
    #pragma clang loop unroll(disable) vectorize_width(8)
    for(int j = 0; j < 8; ++j) {
        // but independent in "j"
        // ...
    }
}
```

!!! note
    Our basic experiments show that fixed-width vectorization (NEON) tends to perform better for short (register-length) loops than SVE. In cases (like the one above) where the specified `vectorize_width` is larger than the available vector unit width, Clang will emit multiple NEON instructions (e.g., 4 instructions will be emitted to process 8 64-bit operations in the 128-bit units of Grace).

### NVHPC Toolchain

The NVHPC toolchain handled the aforementioned case without any additional tuning. A simple `-O3 -march=native -fast` should therefore be sufficient.

## Basic Math Libraries

The basic libraries (BLAS and LAPACK) are included in the NVHPC toolchain and can be used simply via `-lblas` and `-llapack` for BLAS and LAPACK, respectively (`lp64` and `ilp64` versions are also included).

!!! note
    The Grace platform doesn't include a CUDA-capable GPU, so `nvcc` will fail with an error. This means that `nvc`, `nvc++`, and `nvfortran` should be used instead.

### NVIDIA Performance Libraries

The [NVPL](https://developer.nvidia.com/nvpl) package includes a more extensive set of libraries in both sequential and multi-threaded versions:

- BLACS: `-lnvpl_blacs_{lp64,ilp64}_{mpich,openmpi3,openmpi4,openmpi5}`
- BLAS: `-lnvpl_blas_{lp64,ilp64}_{seq,gomp}`
- FFTW: `-lnvpl_fftw`
- LAPACK: `-lnvpl_lapack_{lp64,ilp64}_{seq,gomp}`
- ScaLAPACK: `-lnvpl_scalapack_{lp64,ilp64}`
- RAND: `-lnvpl_rand` or `-lnvpl_rand_mt`
- SPARSE: `-lnvpl_sparse`

This package should be compatible with all available toolchains and includes CMake module files for easy integration into CMake-based projects. For further documentation, see also [NVPL](https://docs.nvidia.com/nvpl).

### Recommended BLAS Library

We recommend using the multi-threaded BLAS library from the NVPL package.

!!! note
    It is important to pin the processes using **OMP_PROC_BIND=spread**

Example:

```console
$ ml NVHPC
$ nvc -O3 -march=native myprog.c -o myprog -lnvpl_blas_lp64_gomp
$ OMP_PROC_BIND=spread ./myprog
```

## Basic Communication Libraries

The OpenMPI 4 implementation is included with the NVHPC toolchain and is exposed as a module (`ml OpenMPI`). The following example

```cpp
#include <mpi.h>
#include <sched.h>
#include <omp.h>

int main(int argc, char **argv)
{
        int rank;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        #pragma omp parallel
        {
                printf("Hello on rank %d, thread %d on CPU %d\n", rank, omp_get_thread_num(), sched_getcpu());
        }
        MPI_Finalize();
}
```

can be compiled and run as follows

```console
ml OpenMPI
mpic++ -fast -fopenmp hello.cpp -o hello
OMP_PROC_BIND=close OMP_NUM_THREADS=4 mpirun -np 4 --map-by slot:pe=36 ./hello
```

In this configuration, we run 4 ranks, each bound to one quarter of the cores (36 cores per rank) and each running 4 OpenMP threads.

## Simple BLAS Application

The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS level 3 routine).

Stationary probability vector estimation in `C++`:

```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"

const size_t ITERATIONS  = 32;
const size_t MATRIX_SIZE = 1024;

int main(int argc, char *argv[])
{
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;

    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;

    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);

    std::copy(a.begin(), a.end(), w1.begin());

    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;

    auto c1 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);

        std::swap(t1, t2);
    }

    auto c2 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }

    std::cout << std::endl;

    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    return 0;
}
```

Stationary probability vector estimation in `Fortran`:

```fortran
program main
    implicit none

    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2

    iterations  = 32
    matrix_size = 1024

    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)

    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))

    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5

    w1 = a
    w2(:,:) = 0.0

    t1 => w1
    t2 => w2

    call system_clock(c1)

    do i = 0, iterations
        t2(:,:) = 0.0

        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)

        tmp => t1
        t1  => t2
        t2  => tmp
    end do

    call system_clock(c2)

    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)

    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)

    deallocate(a)
    deallocate(w1)
    deallocate(w2)
end program main
```

### Using NVHPC Toolchain

The C++ version of the example can be compiled with NVHPC and run as follows:

```console
ml NVHPC
nvc++ -O3 -march=native -fast -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lblas main.cpp -o main
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main
```

The Fortran version is just as simple:

```console
ml NVHPC
nvfortran -O3 -march=native -fast -lblas main.f90 -o main.x
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main.x
```

!!! note
    It may be advantageous to use the NVPL libraries instead of the NVHPC ones. For example, the DGEMM BLAS 3 routine from NVPL is almost 30% faster than the NVHPC one.

### Using Clang (For Grace) Toolchain

Similarly, the Clang for Grace toolchain with NVPL BLAS can be used to compile the C++ version of the example.

```console
ml NVHPC
/opt/nvidia/clang/17.23.11/bin/clang++ -O3 -march=native -ffast-math -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lnvpl_blas_lp64_gomp main.cpp -o main
```

!!! note
    The NVHPC module is used only for the `cblas.h` include in this case. This can be avoided by changing the code to use `nvpl_blas.h` instead.

## Additional Resources

- [https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/][1]
- [https://developer.nvidia.com/hpc-sdk][2]
- [https://developer.nvidia.com/grace/clang][3]
- [https://docs.nvidia.com/nvpl][4]

[1]: https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/
[2]: https://developer.nvidia.com/hpc-sdk
[3]: https://developer.nvidia.com/grace/clang
[4]: https://docs.nvidia.com/nvpl
+279 −0
# Heterogeneous Memory Management on Intel Platforms

Partition `p10-intel` offers heterogeneous memory directly exposed to the user. This allows manually picking the appropriate kind of memory at process or even single-allocation granularity. Both kinds of memory are exposed as separate NUMA nodes, which gives both coarse-grained (process-level) and fine-grained (allocation-level) control over the memory type used.

## Overview

At the process level, the `numactl` facilities can be utilized, while the Intel-provided `memkind` library allows for finer control. Both the `memkind` library and `numactl` can be accessed by loading the `memkind` module or the `OpenMPI` module (`numactl` only).

```bash
ml memkind
```

### Process Level (NUMACTL)

`numactl` allows you to either restrict the memory pool of the process to a specific set of NUMA memory nodes

```bash
numactl --membind <node_ids_set>
```

or select a single preferred node

```bash
numactl --preferred <node_id>
```

where `<node_ids_set>` is a comma-separated list (e.g., `0,2,5,...`), possibly in combination with ranges (such as `0-5`). The `membind` option kills the process if it requests more memory than can be satisfied from the specified nodes. The `preferred` option instead falls back to other nodes according to their NUMA distance in the same situation.

A convenient way to check the `numactl` configuration is

```bash
numactl -s
```

which prints the configuration of its execution environment, e.g.:

```bash
numactl --membind 8-15 numactl -s
policy: bind
preferred node: 0
physcpubind: 0 1 2 ... 189 190 191
cpubind: 0 1 2 3 4 5 6 7
nodebind: 0 1 2 3 4 5 6 7
membind: 8 9 10 11 12 13 14 15
```

The last row shows that memory allocations are restricted to NUMA nodes `8-15`.

### Allocation Level (MEMKIND)

The `memkind` library (in its simplest use case) offers a new variant of the `malloc/free` function pair, which allows specifying the kind of memory to be used for a given allocation. Moving a specific allocation from the default to the HBM memory pool can then be achieved by replacing:

```cpp
void *pData = malloc(<SIZE>);
/* ... */
free(pData);
```

with

```cpp
#include <memkind.h>

void *pData = memkind_malloc(MEMKIND_HBW, <SIZE>);
/* ... */
memkind_free(NULL, pData); // "kind" parameter is deduced from the address
```

Similarly other memory types can be chosen.

!!! note
    The allocation will return `NULL` pointer when memory of specified kind is not available.

## High Bandwidth Memory (HBM)

Intel Sapphire Rapids (partition `p10-intel`) consists of two sockets each with `128GB` of DDR and `64GB` on-package HBM memory. The machine is configured in FLAT mode and therefore exposes HBM memory as memory-only NUMA nodes (`16GB` per 12-core tile). The configuration can be verified by running

```bash
numactl -H
```

which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR DRAM, while `8-15` should have no cores and `16GB` of HBM each).

![](../../img/cs/guides/p10_numa_sc4_flat.png)

### Process Level

With this we can easily restrict application to DDR DRAM or HBM memory:

```bash
# Only DDR DRAM
numactl --membind 0-7 ./stream
# ...
Function    Best Rate MB/s  Avg time     Min time     Max time
Copy:          369745.8     0.043355     0.043273     0.043588
Scale:         366989.8     0.043869     0.043598     0.045355
Add:           378054.0     0.063652     0.063483     0.063899
Triad:         377852.5     0.063621     0.063517     0.063884

# Only HBM
numactl --membind 8-15 ./stream
# ...
Function    Best Rate MB/s  Avg time     Min time     Max time
Copy:         1128430.1     0.015214     0.014179     0.015615
Scale:        1045065.2     0.015814     0.015310     0.016309
Add:          1096992.2     0.022619     0.021878     0.024182
Triad:        1065152.4     0.023449     0.022532     0.024559
```

The DDR DRAM achieves a bandwidth of around 400 GB/s, while the HBM clears the 1 TB/s bar.

Some further improvements can be achieved by entirely isolating a process to a single tile. This can be useful for MPI jobs, where `$OMPI_COMM_WORLD_RANK` can be used to bind each process individually. The simple wrapper script to do this may look like

```bash
#!/bin/bash
numactl --membind $((8 + $OMPI_COMM_WORLD_RANK)) $@
```

and can be used as

```bash
mpirun -np 8 --map-by slot:pe=12 membind_wrapper.sh ./stream_mpi
```

(8 tiles with 12 cores each). However, this approach assumes `16GB` of HBM memory local to the tile is sufficient for each process (memory cannot spill between tiles). This approach may be significantly more useful in combination with `--preferred` instead of `--membind` to force preference of local HBM with spill to DDR DRAM. Otherwise

```bash
mpirun -n 8 --map-by slot:pe=12 numactl --membind 8-15 ./stream_mpi
```

is most likely preferable even for MPI workloads. Applying the above approach to MPI Stream with 8 ranks and 1-24 threads per rank, we can expect these results:
![](../../img/cs/guides/p10_stream_dram.png)
![](../../img/cs/guides/p10_stream_hbm.png)

### Allocation Level

Allocation-level memory kind selection using the `memkind` library can be illustrated with a modified stream benchmark. The stream benchmark uses three working arrays (A, B, and C), whose allocation can be changed to `memkind_malloc` as follows:

```cpp
#include <memkind.h>
// ...
STREAM_TYPE *a = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *b = (STREAM_TYPE *)memkind_malloc(MEMKIND_REGULAR, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *c = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
// ...
memkind_free(NULL, a);
memkind_free(NULL, b);
memkind_free(NULL, c);
```

Arrays A and C are allocated from HBM (`MEMKIND_HBW_ALL`), while DDR DRAM (`MEMKIND_REGULAR`) is used for B.
The code then has to be linked with `memkind` library

```bash
gcc -march=native -O3 -fopenmp -lmemkind memkind_stream.c -o memkind_stream
```

and can be run as

```bash
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
OMP_NUM_THREADS=$((N*12)) OMP_PROC_BIND=spread ./memkind_stream
```

While the `memkind` library should be able to detect HBM memory on its own (through `HMAT` and `hwloc`), this is not supported on `p10-intel`. This means that the NUMA nodes representing HBM have to be specified manually using the `MEMKIND_HBW_NODES` environment variable.

![](../../img/cs/guides/p10_stream_memkind.png)

With this setup, we can see that the simple copy operation (C[i] = A[i]) achieves bandwidth comparable to the application bound entirely to HBM memory. On the other hand, the scale operation (B[i] = s*C[i]) is mostly limited by the DDR DRAM bandwidth. It is also worth noting that operations combining all three arrays perform close to the HBM-only configuration.

## Simple Application

One application that can greatly benefit from the availability of a large, slower memory alongside a smaller, faster one is computing a histogram with many bins over a large dataset.

```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cstring>
#include <omp.h>
#include <memkind.h>

const size_t N_DATA_SIZE  = 2 * 1024 * 1024 * 1024ull;
const size_t N_BINS_COUNT = 1 * 1024 * 1024ull;
const size_t N_ITERS      = 10;

#if defined(HBM)
    #define DATA_MEMKIND MEMKIND_REGULAR
    #define BINS_MEMKIND MEMKIND_HBW_ALL
#else
    #define DATA_MEMKIND MEMKIND_REGULAR
    #define BINS_MEMKIND MEMKIND_REGULAR
#endif

int main(int argc, char *argv[])
{
    const double binWidth = 1.0 / double(N_BINS_COUNT + 1);

    double *pData = (double *)memkind_malloc(DATA_MEMKIND, N_DATA_SIZE * sizeof(double));
    size_t *pBins = (size_t *)memkind_malloc(BINS_MEMKIND, N_BINS_COUNT * omp_get_max_threads() * sizeof(double));

    #pragma omp parallel
    {
        drand48_data state;
        srand48_r(omp_get_thread_num(), &state);

        #pragma omp for
        for(size_t i = 0; i < N_DATA_SIZE; ++i)
            drand48_r(&state, &pData[i]);
    }

    auto c1 = std::chrono::steady_clock::now();

    for(size_t it = 0; it < N_ITERS; ++it)
    {
        #pragma omp parallel
        {
            for(size_t i = 0; i < N_BINS_COUNT; ++i)
                pBins[omp_get_thread_num()*N_BINS_COUNT + i] = size_t(0);

            #pragma omp for
            for(size_t i = 0; i < N_DATA_SIZE; ++i)
            {
                const size_t idx = size_t(pData[i] / binWidth) % N_BINS_COUNT;
                pBins[omp_get_thread_num()*N_BINS_COUNT + idx]++;
            }
        }
    }

    auto c2 = std::chrono::steady_clock::now();

    #pragma omp parallel for
    for(size_t i = 0; i < N_BINS_COUNT; ++i)
    {
        for(size_t j = 1; j < omp_get_max_threads(); ++j)
            pBins[i] += pBins[j*N_BINS_COUNT + i];
    }

    std::cout << "Elapsed Time [s]: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    size_t total = 0;
    #pragma omp parallel for reduction(+:total)
    for(size_t i = 0; i < N_BINS_COUNT; ++i)
        total += pBins[i];

    std::cout << "Total Items: " << total << std::endl;

    memkind_free(NULL, pData);
    memkind_free(NULL, pBins);

    return 0;
}
```

### Using HBM Memory (P10-Intel)

The following commands can be used to compile and run the example application above:

```bash
ml GCC memkind
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
g++ -O3 -fopenmp -lmemkind histogram.cpp -o histogram_dram
g++ -O3 -fopenmp -lmemkind -DHBM histogram.cpp -o histogram_hbm
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_dram
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_hbm
```

Moving the histogram bins into HBM memory should speed up the algorithm by more than a factor of two. It should be noted that also moving the `pData` array into HBM memory worsens this result (presumably because, with the data left in DDR DRAM, the algorithm can saturate both memory interfaces).

## Additional Resources

- [https://linux.die.net/man/8/numactl][1]
- [http://memkind.github.io/memkind/man_pages/memkind.html][2]
- [https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory][3]

[1]: https://linux.die.net/man/8/numactl
[2]: http://memkind.github.io/memkind/man_pages/memkind.html
[3]: https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory
+79 −0
# Using VMware Horizon

VMware Horizon is a virtual desktop infrastructure (VDI) solution
that enables users to access virtual desktops and applications from any device and any location.
It provides a comprehensive end-to-end solution for managing and delivering virtual desktops and applications,
including features such as session management, user authentication, and virtual desktop provisioning.

![](../../img/horizon.png)

## How to Access VMware Horizon

!!! important
    Access to VMware Horizon requires IT4I VPN.

1. Contact [IT4I support][a] with a request for access and VM allocation.
1. [Download][1] and install the VMware Horizon Client for Windows.
1. Add a new server `https://vdi-cs01.msad.it4i.cz/` in the Horizon client.
1. Connect to the server using your IT4I username and password.
   Username is in the `domain\username` format and the domain is `msad.it4i.cz`.
   For example: `msad.it4i.cz\user123`

## Example

Below is an example of how to mount a remote folder and check the connection on Windows OS:

### Prerequisites

3D applications

* [Blender][3]

SSHFS for remote access

* [sshfs-win][4]
* [winfsp][5]
* [sshfs-win-manager][6]
* ssh keys for access to clusters

### Steps

1. Start the VPN and connect to the server via VMware Horizon Client.

    ![](../../img/vmware.png)

1. Mount a remote folder.
    * Run sshfs-win-manager.

    ![](../../img/sshfs.png)

    * Add a new connection.

    ![](../../img/sshfs1.png)

    * Click on **Connect**.

    ![](../../img/sshfs2.png)

1. Check that the folder is mounted.

    ![](../../img/mount.png)

1. Check the GPU resources.

    ![](../../img/gpu.png)

### Blender

Now if you run, for example, Blender, you can check the available GPU resources in Blender Preferences.

  ![](../../img/blender.png)

[a]: mailto:support@it4i.cz

[1]: https://vdi-cs01.msad.it4i.cz/
[2]: https://www.paraview.org/download/
[3]: https://www.blender.org/download/
[4]: https://github.com/winfsp/sshfs-win/releases
[5]: https://github.com/winfsp/winfsp/releases/
[6]: https://github.com/evsar3/sshfs-win-manager/releases
+227 −0
# Using IBM Power Partition

For testing your application on the IBM Power partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 192 -A PROJECT-ID -p p07-power --time=08:00:00
```

where:

- `-N 1` means allocation of a single node,
- `-c 192` means allocation of 192 cores (threads),
- `-p p07-power` is the IBM Power partition,
- `--time=08:00:00` means allocation for 8 hours.
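
A minimal batch script equivalent of this allocation might look like the sketch below (the job name and the final command are placeholders; replace `PROJECT-ID` with your project):

```bash
#!/bin/bash
#SBATCH --job-name=power-test       # placeholder name
#SBATCH --account=PROJECT-ID        # your project ID
#SBATCH --partition=p07-power       # IBM Power partition
#SBATCH --nodes=1
#SBATCH --cpus-per-task=192
#SBATCH --time=08:00:00

ml architecture/ppc64le   # reload the module list for the Power architecture (see below)
ml GCC OpenBLAS
./hello                   # placeholder for your application
```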

On the partition, you should reload the list of modules:

```
ml architecture/ppc64le
```

The platform offers both `GNU`-based and proprietary IBM toolchains for building applications. IBM also provides an optimized BLAS routines library ([ESSL](https://www.ibm.com/docs/en/essl/6.1)), which can be used with both toolchains.

## Building Applications

Our sample application depends on `BLAS`, therefore we start by loading the following modules (regardless of which toolchain we want to use):

```
ml GCC OpenBLAS
```

### GCC Toolchain

In the case of the GCC toolchain, we can go ahead and compile the application using either `g++`

```
g++ -lopenblas hello.cpp -o hello
```

or `gfortran`

```
gfortran -lopenblas hello.f90 -o hello
```

as usual.

### IBM Toolchain

The IBM toolchain requires additional environment setup as it is installed in `/opt/ibm` and is not exposed as a module

```
IBM_ROOT=/opt/ibm
OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1
OPENXLF_ROOT=$IBM_ROOT/openxlf/17.1.1

export PATH=$OPENXLC_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLC_ROOT/lib:$LD_LIBRARY_PATH

export PATH=$OPENXLF_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH
```

From there, we can use either `ibm-clang++`

```
ibm-clang++ -lopenblas hello.cpp -o hello
```

or `xlf`

```
xlf -lopenblas hello.f90 -o hello
```

to build the application as usual.

!!! note
    The combination of `xlf` and `openblas` seems to cause severe performance degradation. Therefore, the `ESSL` library should be preferred (see below).

### Using ESSL Library

The [ESSL](https://www.ibm.com/docs/en/essl/6.1) library is installed in `/opt/ibm/math/essl/7.1`, so we define additional environment variables:

```
IBM_ROOT=/opt/ibm
ESSL_ROOT=${IBM_ROOT}/math/essl/7.1
export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH
```

The simplest way to utilize `ESSL` in an application that already uses `BLAS` or `CBLAS` routines is to link with the provided `libessl.so`. This can be done by replacing `-lopenblas` with `-lessl`, or with `-lessl -lopenblas` (in case `ESSL` does not provide all the required `BLAS` routines).
In practice, this can look like

```
g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello
```

or

```
gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello
```

and similarly for IBM compilers (`ibm-clang++` and `xlf`).

## Hello World Applications

The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS level 3 routine).

Stationary probability vector estimation in `C++`:

```c++
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"

const size_t ITERATIONS  = 32;
const size_t MATRIX_SIZE = 1024;

int main(int argc, char *argv[])
{
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;

    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;

    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);

    std::copy(a.begin(), a.end(), w1.begin());

    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;

    auto c1 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);

        std::swap(t1, t2);
    }

    auto c2 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }

    std::cout << std::endl;

    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    return 0;
}
```

Stationary probability vector estimation in `Fortran`:

```fortran
program main
    implicit none

    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2

    iterations  = 32
    matrix_size = 1024

    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)

    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))

    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5

    w1 = a
    w2(:,:) = 0.0

    t1 => w1
    t2 => w2

    call system_clock(c1)

    do i = 0, iterations
        t2(:,:) = 0.0

        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)

        tmp => t1
        t1  => t2
        t2  => tmp
    end do

    call system_clock(c2)

    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)

    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)

    deallocate(a)
    deallocate(w1)
    deallocate(w2)
end program main
```
-# Complementary Systems
+# Introduction

 Complementary systems offer development environment for users
 that need to port and optimize their code and applications
@@ -26,6 +26,8 @@ Second stage of complementary systems implementation comprises of these partitio
 - compute partition 7 - based on IBM Power10 architecture
 - compute partition 8 - modern CPU with a very high L3 cache capacity (over 750MB)
 - compute partition 9 - virtual GPU accelerated workstations
+- compute partition 10 - Sapphire Rapids-HBM server
+- compute partition 11 - NVIDIA Grace CPU Superchip

 ![](../img/cs2_2.png)

Original line number Original line Diff line number Diff line
@@ -20,6 +20,7 @@ p05-synt up 1-00:00:00 0/1/0/1 p05-synt01
p06-arm      up 1-00:00:00          0/2/0/2 p06-arm[01-02]
p06-arm      up 1-00:00:00          0/2/0/2 p06-arm[01-02]
p07-power    up 1-00:00:00          0/1/0/1 p07-power01
p07-power    up 1-00:00:00          0/1/0/1 p07-power01
p08-amd      up 1-00:00:00          0/1/0/1 p08-amd01
p08-amd      up 1-00:00:00          0/1/0/1 p08-amd01
p10-intel    up 1-00:00:00          0/1/0/1 p10-intel01
```
```


## Getting Job Information
## Getting Job Information
@@ -89,7 +90,7 @@ set | grep ^SLURM


| variable name | description | example |
| variable name | description | example |
| ------ | ------ | ------ |
| ------ | ------ | ------ |
| SLURM_JOBID | job id of the executing job| 593 |
| SLURM_JOB_ID | job id of the executing job| 593 |
| SLURM_JOB_NODELIST | nodes allocated to the job | p03-amd[01-02] |
| SLURM_JOB_NODELIST | nodes allocated to the job | p03-amd[01-02] |
| SLURM_JOB_NUM_NODES | number of nodes allocated to the job | 2 |
| SLURM_JOB_NUM_NODES | number of nodes allocated to the job | 2 |
| SLURM_STEP_NODELIST | nodes allocated to the job step | p03-amd01 |
| SLURM_STEP_NODELIST | nodes allocated to the job step | p03-amd01 |
@@ -145,6 +146,7 @@ $ scancel JOBID
| p06-arm   | 2     | yes        | 80             | aarch64,ib |
| p06-arm   | 2     | yes        | 80             | aarch64,ib |
| p07-power | 1     | yes        | 192            | ppc64le,ib |
| p07-power | 1     | yes        | 192            | ppc64le,ib |
| p08-amd   | 1     | yes        | 128            | x86_64,amd,milan-x,ib,ht |
| p08-amd   | 1     | yes        | 128            | x86_64,amd,milan-x,ib,ht |
| p10-intel | 1     | yes        | 96             | x86_64,intel,sapphire_rapids,ht |


Use `-t`, `--time` option to specify job run time limit. Default job time limit is 2 hours, maximum job time limit is 24 hours.
Use `-t`, `--time` option to specify job run time limit. Default job time limit is 2 hours, maximum job time limit is 24 hours.


@@ -312,6 +314,14 @@ Whole node allocation:
salloc -A PROJECT-ID -p p08-amd
salloc -A PROJECT-ID -p p08-amd
```
```


## Partition 10 - Intel Sapphire Rapids

Whole node allocation:

```console
salloc -A PROJECT-ID -p p10-intel
```

## Features
## Features


Nodes have feature tags assigned to them.
Nodes have feature tags assigned to them.
@@ -326,6 +336,7 @@ Users can select nodes based on the feature tags using --constraint option.
| intel | manufacturer |
| intel | manufacturer |
| icelake | processor family |
| icelake | processor family |
| broadwell | processor family |
| broadwell | processor family |
| sapphire_rapids | processor family |
| milan | processor family |
| milan | processor family |
| milan-x | processor family |
| milan-x | processor family |
| ib | Infiniband |
| ib | Infiniband |
@@ -342,10 +353,14 @@ p00-arm01 aarch64,cortex-a72
p01-arm[01-08]   aarch64,a64fx,ib
p01-arm[01-08]   aarch64,a64fx,ib
p02-intel01      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,ht
p02-intel01      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,ht
p02-intel02      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,noht
p02-intel02      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,noht
p03-amd01        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,ht
p03-amd02        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,noht
p03-amd02        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,noht
p03-amd01        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,ht
p04-edge01       x86_64,intel,broadwell,ib,ht
p04-edge01       x86_64,intel,broadwell,ib,ht
p05-synt01       x86_64,amd,milan,ib,ht
p05-synt01       x86_64,amd,milan,ib,ht
p06-arm[01-02]   aarch64,ib
p07-power01      ppc64le,ib
p08-amd01        x86_64,amd,milan-x,ib,ht
p10-intel01      x86_64,intel,sapphire_rapids,ht
```
```


```
```
Original line number Original line Diff line number Diff line
@@ -199,7 +199,38 @@ The following is the list of software available on partiton 09:
- 40x Windows 10/11 Enterprise E3 VDA (Microsoft) per year
- 40x Windows 10/11 Enterprise E3 VDA (Microsoft) per year
- Hardware VMware Horizon management
- Hardware VMware Horizon management


## Partition 10 - Sapphire Rapids-HBM Server

The primary purpose of this server is to evaluate the impact of HBM memory on an x86 processor
on the performance of user applications.
HBM is a feature previously available only on GPGPU accelerators,
where it provides a significant boost to memory-bound applications.
Users can also compare the impact of HBM memory with the impact of the large L3 cache
of the AMD Milan-X processor, which is also available among the complementary systems.
The server is equipped with DDR5 memory as well, enabling comparative studies with reference to DDR4-based systems.

- 2x Intel® Xeon® CPU Max 9468, 48 cores, base 2.1GHz, max 3.5GHz
- 16x 16GB DDR5 4800MHz
- 2x Intel D3 S4520 960GB SATA 6Gb/s
- 1x Supermicro Standard LP 2-port 10GbE RJ45, Broadcom BCM57416
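
On this server, the HBM is typically exposed to the operating system as additional NUMA nodes
next to the DDR5 memory. A quick way to inspect the resulting memory layout is `numactl`
(a sketch; the exact NUMA topology depends on the configured memory mode):

```console
$ numactl -H
```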

## Partition 11 - NVIDIA Grace CPU Superchip

The [NVIDIA Grace CPU Superchip][6] uses the [NVIDIA® NVLink®-C2C][5] technology to deliver 144 Arm® Neoverse V2 cores and 1TB/s of memory bandwidth.
It runs all NVIDIA software stacks and platforms, including NVIDIA RTX™, NVIDIA HPC SDK, NVIDIA AI, and NVIDIA Omniverse™.

- Superchip design with up to 144 Arm Neoverse V2 CPU cores with Scalable Vector Extensions (SVE2)
- World’s first LPDDR5X with error-correcting code (ECC) memory, 1TB/s total bandwidth
- 900GB/s coherent interface, 7X faster than PCIe Gen 5
- NVIDIA Scalable Coherency Fabric with 3.2TB/s of aggregate bisectional bandwidth
- 2X the packaging density of DIMM-based solutions
- 2X the performance per watt of today’s leading CPU
- FP64 Peak of 7.1TFLOPS

[1]: https://www.bittware.com/fpga/520n-mx/
[1]: https://www.bittware.com/fpga/520n-mx/
[2]: https://www.xilinx.com/products/boards-and-kits/alveo/u250.html#overview
[2]: https://www.xilinx.com/products/boards-and-kits/alveo/u250.html#overview
[3]: https://www.xilinx.com/products/boards-and-kits/alveo/u280.html#overview
[3]: https://www.xilinx.com/products/boards-and-kits/alveo/u280.html#overview
[4]: https://developer.arm.com/documentation/100095/0003/
[4]: https://developer.arm.com/documentation/100095/0003/
[5]: https://www.nvidia.com/en-us/data-center/nvlink-c2c/
[6]: https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/
Original line number Original line Diff line number Diff line
@@ -7,7 +7,8 @@


## How to Access
## How to Access


The DGX-2 machine can be accessed through the scheduler from Barbora login nodes `barbora.it4i.cz` as a compute node cn202.
The DGX-2 machine is integrated into [Barbora cluster][3].
The DGX-2 machine can be accessed from the Barbora login nodes `barbora.it4i.cz` through the Barbora scheduler queue `qdgx` as the compute node cn202.


## Storage
## Storage


@@ -32,3 +33,4 @@ For more information on accessing PROJECT, its quotas, etc., see the [PROJECT Da


[1]: ../../barbora/storage/#home-file-system
[1]: ../../barbora/storage/#home-file-system
[2]: ../../storage/project-storage
[2]: ../../storage/project-storage
[3]: ../../barbora/introduction
Original line number Original line Diff line number Diff line
# NVIDIA DGX-2
# Introduction


The DGX-2 is a very powerful computational node, featuring high end x86_64 processors and 16 NVIDIA V100-SXM3 GPUs.
NVIDIA DGX-2 is a very powerful computational node, featuring high end x86_64 processors and 16 NVIDIA V100-SXM3 GPUs.


| NVIDIA DGX-2  | |
| NVIDIA DGX-2  | |
| --- | --- |
| --- | --- |
Original line number Original line Diff line number Diff line
@@ -2,38 +2,24 @@


To run a job, computational resources of DGX-2 must be allocated.
To run a job, computational resources of DGX-2 must be allocated.


## Resources Allocation Policy
The DGX-2 machine is integrated into and accessible through the Barbora cluster; the queue for the DGX-2 machine is called **qdgx**.

The resources are allocated to the job in a fair-share fashion, subject to constraints set by the queue. The queue provides prioritized and exclusive access to computational resources.

The queue for the DGX-2 machine is called **qdgx**.

!!! note
    The qdgx queue is configured to run one job and accept one job in a queue per user with the maximum walltime of a job being **48** hours.

## Job Submission and Execution

The `qsub` submits the job into the queue. The command creates a request to the PBS Job manager for allocation of specified resources. The resources will be allocated when available, subject to allocation policies and constraints. After the resources are allocated, the jobscript or interactive shell is executed on the allocated node.

### Job Submission


When allocating computational resources for the job, specify:
When allocating computational resources for the job, specify:


1. a queue for your job (the default is **qdgx**);
1. your Project ID
1. the maximum wall time allocated to your calculation (default is **4 hour**, maximum is **48 hour**);
1. a queue for your job - **qdgx**;
1. a jobscript or interactive switch.
1. the maximum time allocated to your calculation (default is **4 hours**, maximum is **48 hours**);

1. a jobscript if batch processing is intended.
!!! info
    You can access the DGX PBS scheduler by loading the "DGX-2" module.


Submit the job using the `qsub` command:
Submit the job using the `sbatch` (for batch processing) or `salloc` (for interactive session) command:


**Example**
**Example**


```console
```console
[kru0052@login2.barbora ~]$ qsub -q qdgx -l walltime=02:00:00 -I
[kru0052@login2.barbora ~]$ salloc -A PROJECT-ID -p qdgx --time=02:00:00
qsub: waiting for job 258.dgx to start
salloc: Granted job allocation 36631
qsub: job 258.dgx ready
salloc: Waiting for resource configuration
salloc: Nodes cn202 are ready for job


kru0052@cn202:~$ nvidia-smi
kru0052@cn202:~$ nvidia-smi
Wed Jun 16 07:46:32 2021
Wed Jun 16 07:46:32 2021
@@ -95,9 +81,9 @@ kru0052@cn202:~$ exit
```
```


!!! tip
!!! tip
    Submit the interactive job using the `qsub -I ...` command.
    Submit the interactive job using the `salloc` command.


### Job Execution
## Job Execution


The DGX-2 machine runs only a bare-bone, minimal operating system. Users are expected to run
The DGX-2 machine runs only a bare-bone, minimal operating system. Users are expected to run
**[Apptainer/Singularity][1]** containers in order to enrich the environment according to the needs.
**[Apptainer/Singularity][1]** containers in order to enrich the environment according to the needs.
@@ -107,12 +93,13 @@ Containers (Docker images) optimized for DGX-2 may be downloaded from
copy the docker nvcr.io link from the Pull Command section. This link may be directly used
copy the docker nvcr.io link from the Pull Command section. This link may be directly used
to download the container via Apptainer/Singularity, see the example below:
to download the container via Apptainer/Singularity, see the example below:


#### Example - Apptainer/Singularity Run Tensorflow
### Example - Apptainer/Singularity Run Tensorflow


```console
```console
[kru0052@login2.barbora ~]$ qsub -q qdgx -l walltime=01:00:00 -I
[kru0052@login2.barbora ~] $ salloc -A PROJECT-ID -p qdgx --time=02:00:00
qsub: waiting for job 96.dgx to start
salloc: Granted job allocation 36633
qsub: job 96.dgx ready
salloc: Waiting for resource configuration
salloc: Nodes cn202 are ready for job


kru0052@cn202:~$ singularity shell docker://nvcr.io/nvidia/tensorflow:19.02-py3
kru0052@cn202:~$ singularity shell docker://nvcr.io/nvidia/tensorflow:19.02-py3
Singularity tensorflow_19.02-py3.sif:~>
Singularity tensorflow_19.02-py3.sif:~>
Original line number Original line Diff line number Diff line
@@ -2,7 +2,9 @@


DICE (Data Infrastructure Capacity for EOSC) is an international project funded by the European Union
DICE (Data Infrastructure Capacity for EOSC) is an international project funded by the European Union
that provides cutting-edge data management services and a significant amount of storage resources for the EOSC.
that provides cutting-edge data management services and a significant amount of storage resources for the EOSC.
The EOSC (European Open Science Cloud) project provides European researchers, innovators, companies, and citizens with a federated and open multi-disciplinary environment where they can publish, find, and re-use data, tools, and services for research, innovation and educational purposes.
The EOSC (European Open Science Cloud) project provides European researchers, innovators, companies,
and citizens with a federated and open multi-disciplinary environment
where they can publish, find, and re-use data, tools, and services for research, innovation and educational purposes.


For more information, see the official [DICE project][b] and [EOSC project][q] pages.
For more information, see the official [DICE project][b] and [EOSC project][q] pages.


Original line number Original line Diff line number Diff line
@@ -2,7 +2,9 @@


## Shells on Clusters
## Shells on Clusters


The table shows which shells are supported on the IT4Innovations clusters.
The table shows which shells are available on the IT4Innovations clusters.

Note that bash is the only supported shell.


| Cluster Name    | bash | tcsh | zsh | ksh | dash |
| Cluster Name    | bash | tcsh | zsh | ksh | dash |
| --------------- | ---- | ---- | --- | --- | ---- |
| --------------- | ---- | ---- | --- | --- | ---- |
@@ -11,7 +13,7 @@ The table shows which shells are supported on the IT4Innovations clusters.
| DGX-2           | yes  | no   | no  | no  | no   |
| DGX-2           | yes  | no   | no  | no  | no   |


!!! info
!!! info
    BASH is the default shell. Should you need a different shell, contact [support\[at\]it4i.cz][3].
    Bash is the default shell. Should you need a different shell, contact [support\[at\]it4i.cz][3].


## Environment Customization
## Environment Customization


@@ -24,7 +26,7 @@ After logging in, you may want to configure the environment. Write your preferre
export MODULEPATH=${MODULEPATH}:/home/$USER/.local/easybuild/modules/all
export MODULEPATH=${MODULEPATH}:/home/$USER/.local/easybuild/modules/all


# User specific aliases and functions
# User specific aliases and functions
alias qs='qstat -a'
alias sq='squeue --me'


# load default intel compilator !!! is not recommended !!!
# load default intel compilator !!! is not recommended !!!
ml intel
ml intel
@@ -37,7 +39,7 @@ fi
```
```


!!! note
!!! note
    Do not run commands outputting to standard output (echo, module list, etc.) in .bashrc for non-interactive SSH sessions. It breaks the fundamental functionality (SCP, PBS) of your account. Take care for SSH session interactivity for such commands as stated in the previous example.
    Do not run commands outputting to standard output (echo, module list, etc.) in .bashrc for non-interactive SSH sessions. It breaks the fundamental functionality (SCP) of your account. Take care for SSH session interactivity for such commands as stated in the previous example.


### Application Modules
### Application Modules


Original line number Original line Diff line number Diff line
@@ -9,15 +9,25 @@ IT4Innovations has become a member of e-INFRA CZ on January 2022.


## Request e-INFRA CZ Account
## Request e-INFRA CZ Account


1. Request an account:
    1. Go to [https://signup.e-infra.cz/fed/registrar/?vo=IT4Innovations][2]
    1. Go to [https://signup.e-infra.cz/fed/registrar/?vo=IT4Innovations][2]
    1. Select a member academic institution you are affiliated with.
    1. Select a member academic institution you are affiliated with.
    1. Fill out the e-INFRA CZ Account information (username, password and ssh key(s)).
    1. Fill out the e-INFRA CZ Account information (username, password and ssh key(s)).


    Your account should be created in a few minutes after submitting the request.
    Your account should be created in a few minutes after submitting the request.

    Once your e-INFRA CZ account is created, it is propagated into IT4I systems
    Once your e-INFRA CZ account is created, it is propagated into IT4I systems
    and can be used to access [SCS portal][3] and [Request Tracker][4].
    and can be used to access [SCS portal][3] and [Request Tracker][4].


1. Provide additional information via [IT4I support][a] or email [support\[at\]it4i.cz][b] (**required**, note that without this information, you cannot use IT4I resources):
    1. **Full name**
    1. **Gender**
    1. **Citizenship**
    1. **Country of residence**
    1. **Organization/affiliation**
    1. **Organization/affiliation country**
    1. **Organization/affiliation type** (university, company, R&D institution, private/public sector (hospital, police), academy of sciences, etc.)
    1. **Job title**  (student, PhD student, researcher, research assistant, employee, etc.)

Continue to apply for a project or project membership to access clusters through the [SCS portal][3].
Continue to apply for a project or project membership to access clusters through the [SCS portal][3].


## Logging Into IT4I Services
## Logging Into IT4I Services
@@ -38,3 +48,6 @@ You can change you profile settings at any time.
[4]: https://support.it4i.cz/
[4]: https://support.it4i.cz/
[5]: ../../management/einfracz-profile.md
[5]: ../../management/einfracz-profile.md
[6]: https://www.eduid.cz/
[6]: https://www.eduid.cz/

[a]: https://support.it4i.cz/rt/
[b]: mailto:support@it4i.cz
Original line number Original line Diff line number Diff line
@@ -8,13 +8,43 @@ For more information, see the Open OnDemand [documentation][2].


## Access Open OnDemand
## Access Open OnDemand


!!! note
    Mate is currently available on Karolina only.

To access the OOD service, you must be connected to [IT4I VPN][a].
To access the OOD service, you must be connected to [IT4I VPN][a].
Then go to [https://ood-karolina.it4i.cz/][3] for Karolina or [https://ood-barbora.it4i.cz/][4] for Barbora and enter your e-INFRA CZ or IT4I credentials.
Then go to [https://ood-karolina.it4i.cz/][3] for Karolina
or [https://ood-barbora.it4i.cz/][4] for Barbora and enter your e-INFRA CZ or IT4I credentials.

From the top menu bar, you can manage your files and jobs, access the cluster's shell
and launch interactive apps on login nodes.


From the top menu bar, you can manage your files and jobs, access the cluster's shell and launch interactive apps on login nodes - Mate & XFCE desktops.
## OOD Apps on IT4I Clusters

!!! note
    Barbora OOD offers Mate and XFCE Desktops on the login node only. Other applications listed below are exclusive to Karolina OOD.

* Desktops
    * Karolina Login Mate
    * Karolina Login XFCE
    * Gnome Desktop
* GUIs
    * Ansys
    * Blender
    * ParaView
    * TorchStudio
* Servers
    * Code Server
    * Jupyter (+IJulia)
    * MATLAB
    * TensorBoard
* Simulation
    * Code Aster

Depending on the selected application, you can set up various properties,
e.g. partition, number of nodes, tasks per node, reservation, etc.

For `qgpu` partitions, you can select the number of GPUs.

![Ansys app in OOD GUI](../../../img/ood-ansys.png)

## Job Composer Tutorial


Under *Jobs > Job Composer*, you can create jobs from several sources.
Under *Jobs > Job Composer*, you can create jobs from several sources.
A simple tutorial will guide you through the process.
A simple tutorial will guide you through the process.
Original line number Original line Diff line number Diff line
@@ -227,10 +227,10 @@ Open a Terminal (_Applications -> System Tools -> Terminal_). Run all the follow


Allow incoming X11 graphics from the compute nodes at the login node:
Allow incoming X11 graphics from the compute nodes at the login node:


Get an interactive session on a compute node (for more detailed info [look here][4]). Forward X11 system using `X` option:
Get an interactive session on a compute node (for more detailed info [look here][4]). Forward X11 system using `--x11` option:


```console
```console
$ qsub -I -X -A PROJECT_ID -q qprod -l select=1:ncpus=36
$ salloc -A PROJECT_ID -p qcpu --x11
```
```


Test that the DISPLAY redirection into your VNC session works, by running an X11 application (e.g. XTerm, Intel Advisor, etc.) on the assigned compute node:
Test that the DISPLAY redirection into your VNC session works, by running an X11 application (e.g. XTerm, Intel Advisor, etc.) on the assigned compute node:
@@ -249,10 +249,10 @@ For a [better performance][1] an SSH connection can be used.


Open two Terminals (_Applications -> System Tools -> Terminal_) as described before.
Open two Terminals (_Applications -> System Tools -> Terminal_) as described before.


Get an interactive session on a compute node (for more detailed info [look here][4]). Forward X11 system using `X` option:
Get an interactive session on a compute node (for more detailed info [look here][4]). Forward X11 system using `--x11` option:


```console
```console
$ qsub -I -X -A PROJECT_ID -q qprod -l select=1:ncpus=36
$ salloc -A PROJECT_ID -p qcpu --x11
```
```


In the second terminal connect to the assigned node and run the X11 application
In the second terminal connect to the assigned node and run the X11 application
Original line number Original line Diff line number Diff line
@@ -99,21 +99,21 @@ In this example, we activate the Intel programing environment tools and then sta


## GUI Applications on Compute Nodes
## GUI Applications on Compute Nodes


Allocate the compute nodes using the `-X` option on the `qsub` command:
Allocate the compute nodes using the `--x11` option on the `salloc` command:


```console
```console
$ qsub -q qexp -l select=2:ncpus=24 -X -I
$ salloc -A PROJECT-ID -p qcpu_exp --x11
```
```


In this example, we allocate 2 nodes via qexp queue, interactively. We request X11 forwarding with the `-X` option. It will be possible to run the GUI enabled applications directly on the first compute node.
In this example, we allocate one node via the qcpu_exp partition, interactively. We request X11 forwarding with the `--x11` option. It will be possible to run GUI-enabled applications directly on the allocated compute node.


For **better performance**, log on the allocated compute node via SSH, using the `-X` option.
For **better performance**, log on the allocated compute node via SSH, using the `-X` option.


```console
```console
$ ssh -X r24u35n680
$ ssh -X cn245
```
```


In this example, we log on the r24u35n680 compute node, with the X11 forwarding enabled.
In this example, we log on to the cn245 compute node with X11 forwarding enabled.


## Gnome GUI Environment
## Gnome GUI Environment


@@ -143,7 +143,7 @@ xinit /usr/bin/ssh -XT -i .ssh/path_to_your_key yourname@cluster-namen.it4i.cz g
```
```


However, this method does not seem to work with recent Linux distributions and you will need to manually source
However, this method does not seem to work with recent Linux distributions and you will need to manually source
/etc/profile to properly set environment variables for PBS.
/etc/profile to properly set environment variables for Slurm.


### Gnome on Windows
### Gnome on Windows


Original line number Original line Diff line number Diff line
@@ -28,7 +28,7 @@ Some applications (e.g. Paraview, Ensight, Blender, Ovito) require not only visu
1. Run interactive job in gnome terminal
1. Run interactive job in gnome terminal


    ```console
    ```console
    [loginX.karolina]$ qsub -q qnvidia -l select=1 -IX -A OPEN-XX-XX -l xorg=True
    [loginX.karolina]$ salloc -A PROJECT-ID -p qgpu --x11 --comment use:xorg=true
    ```
    ```


1. Run Xorg server
1. Run Xorg server
@@ -82,7 +82,7 @@ Some applications (e.g. Paraview, Ensight, Blender, Ovito) require not only visu
1. Run job from terminal:
1. Run job from terminal:


    ```console
    ```console
    [loginX.karolina]$ qsub -q qnvidia -l select=1 -A OPEN-XX-XX -l xorg=True ./run_eevee.sh
    [loginX.karolina]$ sbatch -A PROJECT-ID -p qcpu --comment use:xorg=true ./run_eevee.sh
    ```
    ```


[1]: ./vnc.md
[1]: ./vnc.md
Original line number Original line Diff line number Diff line
@@ -10,7 +10,7 @@ SSH uses public-private key pair for authentication, allowing users to log in wi


A private key file in the `id_rsa` or `*.ppk` format is present locally on local side and used for example in the Pageant SSH agent (for Windows users). The private key should always be kept in a safe place.
A private key file in the `id_rsa` or `*.ppk` format is present locally on local side and used for example in the Pageant SSH agent (for Windows users). The private key should always be kept in a safe place.


An example of private key format:
### Example of RSA Private Key Format


```console
```console
    -----BEGIN RSA PRIVATE KEY-----
    -----BEGIN RSA PRIVATE KEY-----
@@ -42,16 +42,45 @@ An example of private key format:
    -----END RSA PRIVATE KEY-----
    -----END RSA PRIVATE KEY-----
```
```


### Example of Ed25519 Private Key Format

```console
PuTTY-User-Key-File-3: ssh-ed25519
Encryption: aes256-cbc
Comment: eddsa-key-20240910
Public-Lines: 2
AAAAC3NzaC1lZDI1NTE5AAAAIBKNwqaWU260wueN00nBGRwIqeOedRedtS0T7QVn
h0i2
Key-Derivation: Argon2id
Argon2-Memory: 8192
Argon2-Passes: 21
Argon2-Parallelism: 1
Argon2-Salt: bb64fc32b368aa16d6e8159c8d921f63
Private-Lines: 1
+7StvvEmCMchEy1tUyIMLfGTZBk7dgGUpJEJzNl82qmNZD1TmQOqNmCRiK84P/TL
Private-MAC: dc3f83cef42026a2038f28e96f87367d762e72265621d82e2fe124634ec3c905
```

## Public Key
## Public Key


A public key file in the `*.pub` format is present on the remote side and allows an access to the owner of the matching private key.
A public key file in the `*.pub` format is present on the remote side and allows an access to the owner of the matching private key.


An example of public key format:
### Example of RSA Public Key Format


```console
```console
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCpujuOiTKCcGkbbBhrk0Hjmezr5QpM0swscXQE7fOZG0oQSURoapd9tjC9eVy5FvZ339jl1WkJkdXSRtjc2G1U5wQh77VE5qJT0ESxQCEw0S+CItWBKqXhC9E7gFY+UyP5YBZcOneh6gGHyCVfK6H215vzKr3x+/WvWl5gZGtbf+zhX6o4RJDRdjZPutYJhEsg/qtMxcCtMjfm/dZTnXeafuebV8nug3RCBUflvRb1XUrJuiX28gsd4xfG/P6L/mNMR8s4kmJEZhlhxpj8Th0iIc+XciVtXuGWQrbddcVRLxAmvkYAPGnVVOQeNj69pqAR/GXaFAhvjYkseEowQao1 username@organization.example.com
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCpujuOiTKCcGkbbBhrk0Hjmezr5QpM0swscXQE7fOZG0oQSURoapd9tjC9eVy5FvZ339jl1WkJkdXSRtjc2G1U5wQh77VE5qJT0ESxQCEw0S+CItWBKqXhC9E7gFY+UyP5YBZcOneh6gGHyCVfK6H215vzKr3x+/WvWl5gZGtbf+zhX6o4RJDRdjZPutYJhEsg/qtMxcCtMjfm/dZTnXeafuebV8nug3RCBUflvRb1XUrJuiX28gsd4xfG/P6L/mNMR8s4kmJEZhlhxpj8Th0iIc+XciVtXuGWQrbddcVRLxAmvkYAPGnVVOQeNj69pqAR/GXaFAhvjYkseEowQao1 username@organization.example.com
```
```


### Example of Ed25519 Public Key Format

```console
---- BEGIN SSH2 PUBLIC KEY ----
Comment: "eddsa-key-20240910"
AAAAC3NzaC1lZDI1NTE5AAAAIBKNwqaWU260wueN00nBGRwIqeOedRedtS0T7QVn
h0i2
---- END SSH2 PUBLIC KEY ----
```

## SSH Key Management
## SSH Key Management


You can manage your own SSH key for authentication to clusters:
You can manage your own SSH key for authentication to clusters:
Original line number Original line Diff line number Diff line
@@ -5,7 +5,7 @@
To generate a new keypair of your public and private key, use the `ssh-keygen` tool:
To generate a new keypair of your public and private key, use the `ssh-keygen` tool:


```console
```console
local $ ssh-keygen -C 'username@organization.example.com' -f additional_key
local $ ssh-keygen -t ed25519 -C 'username@organization.example.com' -f additional_key
```
```


!!! note
!!! note
@@ -14,6 +14,26 @@ local $ ssh-keygen -C 'username@organization.example.com' -f additional_key
By default, your private key is saved to the `id_rsa` file in the `.ssh` directory
By default, your private key is saved to the `id_rsa` file in the `.ssh` directory
and your public key is saved to the `id_rsa.pub` file.
and your public key is saved to the `id_rsa.pub` file.


## Adding SSH Key to Linux System SSH Agent

1. Start the SSH agent, if it is not already running:

    ```
    eval "$(ssh-agent -s)"
    ```

1. Add the key to SSH Agent:

    ```
    ssh-add ~/.ssh/name_of_your_ssh_key_file
    ```

1. Verify that the key was added to the SSH agent:

    ```
    ssh-add -l
    ```
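
Optionally, you can tell SSH to use the key automatically when connecting to the clusters
by adding an entry to your `~/.ssh/config`. A minimal sketch (the host alias and `your_username`
are placeholders; the key file name matches the example above):

```
Host barbora
    HostName barbora.it4i.cz
    User your_username
    IdentityFile ~/.ssh/name_of_your_ssh_key_file
```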

## Managing Your SSH Key
## Managing Your SSH Key


To manage your SSH key for authentication to clusters, see the [SSH Key Management][1] section.
To manage your SSH key for authentication to clusters, see the [SSH Key Management][1] section.
Original line number Original line Diff line number Diff line
@@ -2,17 +2,26 @@


## Accessing IT4Innovations Internal Resources via VPN
## Accessing IT4Innovations Internal Resources via VPN


To access IT4Innovations' resources and licenses, it is necessary to connect to its local network via VPN. IT4Innovations uses the FortiClient VPN software. For the list of supported operating systems, see the [FortiClient Administration Guide][a].
To access IT4Innovations' resources and licenses, it is necessary to connect to its local network via VPN.
IT4Innovations uses the FortiClient VPN software.
For the list of supported operating systems, see the [FortiClient Administration Guide][a].

!!! Note "Realms"
    If you are a member of a partner organization, we may ask you to use a so-called realm in your VPN connection. In the Remote Gateway field, include the realm path after the IP address or hostname. For example, for the realm `excellent`, the field would read as follows: `reconnect.it4i.cz:443/excellent`.


## VPN Client Download
## VPN Client Download


* Windows: Download the FortiClient app from the [Windows Store][b].
* Windows: Download the **FortiClient VPN-only** app from the [official page][g] (Microsoft Store app is not recommended).
* Mac: Download the FortiClient VPN app from the [Apple Store][d].
* Mac: Download the **FortiClient VPN** app from the [Apple Store][d].
* Linux: Download the [FortiClient][e] or [OpenFortiVPN][f] app.
* Linux: Download the [FortiClient][e] or [OpenFortiVPN][f] app.


## Working With Windows/Mac VPN Client
## Working With Windows/Mac VPN Client


Before the first login, you must configure the VPN. In the New VPN Connection section, provide the name of your VPN connection and the following settings:
!!! Tip "Instructional video for Mac"
    See [the instructional video][h] on how to download the VPN client and connect to the IT4I VPN on Mac.

Before the first login, you must configure the VPN.
In the New VPN Connection section, provide the name of your VPN connection and the following settings:


Name                | Value
Name                | Value
:-------------------|:------------------
:-------------------|:------------------
@@ -27,9 +36,6 @@ Optionally, you can describe the VPN connection and select Save Login under Auth


Save the settings, enter your login credentials and click Connect.
Save the settings, enter your login credentials and click Connect.


!!! note
    Make sure your username and password are correct. If you enter invalid credentials, FortiClient VPN returns a general warning (-14).

![](../../img/fc_vpn_web_login_3_1.png)
![](../../img/fc_vpn_web_login_3_1.png)


## Linux Client
## Linux Client
@@ -44,14 +50,16 @@ Set-Routes | Enabled
Set-DNS      | Enabled
Set-DNS      | Enabled
DNS Servers  | 10.5.8.11, 10.5.8.22
DNS Servers  | 10.5.8.11, 10.5.8.22


Linux VPN clients need to run under root. OpenFortiGUI uses sudo by default, be sure, that your user is allowed to use sudo.
Linux VPN clients need to run under root.
OpenFortiGUI uses sudo by default; be sure that your user is allowed to use sudo.
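
If you prefer the command-line OpenFortiVPN client, a minimal connection sketch looks like this
(use the gateway, and optionally the realm path, provided to you; `your_username` is a placeholder):

```console
$ sudo openfortivpn reconnect.it4i.cz:443 --username=your_username
```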


[1]: ../../general/obtaining-login-credentials/obtaining-login-credentials.md#login-credentials
[1]: ../../general/obtaining-login-credentials/obtaining-login-credentials.md#login-credentials
[2]: ../../general/access/einfracz-account.md
[2]: ../../general/access/einfracz-account.md


[a]: http://docs.fortinet.com/document/forticlient/latest/administration-guide/646779/installation-requirements
[a]: http://docs.fortinet.com/document/forticlient/latest/administration-guide/646779/installation-requirements
[b]: https://apps.microsoft.com/store/detail/forticlient/9WZDNCRDH6MC?hl=en-us&gl=us
[c]: https://github.com/theinvisible/openfortigui
[c]: https://github.com/theinvisible/openfortigui
[d]: https://apps.apple.com/cz/app/forticlient-vpn/id1475674905?l=cs
[d]: https://apps.apple.com/cz/app/forticlient-vpn/id1475674905?l=cs
[e]: https://www.fortinet.com/support/product-downloads/linux
[e]: https://www.fortinet.com/support/product-downloads/linux
[f]: https://github.com/adrienverge/openfortivpn
[f]: https://github.com/adrienverge/openfortivpn
[g]: https://www.fortinet.com/support/product-downloads#vpn
[h]: https://www.youtube.com/watch?v=xGcROEreop8
 No newline at end of file
Original line number Original line Diff line number Diff line
@@ -8,7 +8,6 @@ The computational resources of IT4I are allocated by the Allocation Committee vi


* Academic researchers may apply via Open Access Competitions.
* Academic researchers may apply via Open Access Competitions.
* Commercial and non-commercial institutions may also apply via the Directors Discretion.
* Commercial and non-commercial institutions may also apply via the Directors Discretion.
* Foreign (mostly European) users can obtain computational resources via the [PRACE (DECI) program][d].


In all cases, IT4Innovations’ access mechanisms are aimed at distributing computational resources while taking into account the development and application of supercomputing methods and their benefits and usefulness for society. The applicants are expected to submit a proposal. In the proposal, the applicants **apply for a particular amount of core-hours** of computational resources. The requested core-hours should be substantiated by scientific excellence of the proposal, its computational maturity and expected impacts. The allocation decision is based on the scientific, technical, and economic evaluation of the proposal.
In all cases, IT4Innovations’ access mechanisms are aimed at distributing computational resources while taking into account the development and application of supercomputing methods and their benefits and usefulness for society. The applicants are expected to submit a proposal. In the proposal, the applicants **apply for a particular amount of core-hours** of computational resources. The requested core-hours should be substantiated by scientific excellence of the proposal, its computational maturity and expected impacts. The allocation decision is based on the scientific, technical, and economic evaluation of the proposal.


+4 −0
Original line number Original line Diff line number Diff line
# Acceptable Use Policy

![Acceptable Use Policy (PDF)](../general/AUP-final.pdf){ type=application/pdf style="min-height:100vh;width:100%" }
+32 −0
Original line number Original line Diff line number Diff line
---
hide:
  - toc
---

# Barbora Partitions

!!! important
    Active [project membership][1] is required to run jobs.

Below is the list of partitions available on the Barbora cluster:

| Partition        | Project resources    | Nodes                      | Min ncpus | Priority | Authorization | Walltime (def/max) |
| ---------------- | -------------------- | -------------------------- | --------- | -------- | ------------- | ------------------ |
| **qcpu**         | > 0                  | 190                        | 36        | 2        | no            | 24 / 48h           |
| **qcpu_biz**     | > 0                  | 190                        | 36        | 3        | no            | 24 / 48h           |
| **qcpu_exp**     | < 150% of allocation | 16                         | 36        | 4        | no            | 1 / 1h             |
| **qcpu_free**    | < 150% of allocation | 124<br>max 4 per job       | 36        | 1        | no            | 12 / 18h           |
| **qcpu_long**    | > 0                  | 60<br>max 20 per job       | 36        | 2        | no            | 72 / 144h          |
| **qcpu_preempt** | active Barbora<br>CPU alloc. | 190<br>max 4 per job       | 36        | 0        | no            | 12 / 12h           |
| **qgpu**         | > 0                  | 8                          | 24        | 2        | yes           | 24 / 48h           |
| **qgpu_biz**     | > 0                  | 8                          | 24        | 3        | yes           | 24 / 48h           |
| **qgpu_exp**     | < 150% of allocation | 4<br>max 1 per job         | 24        | 4        | no            | 1 / 1h             |
| **qgpu_free**    | < 150% of allocation | 5<br>max 2 per job         | 24        | 1        | no            | 12 / 18h           |
| **qgpu_preempt** | active Barbora<br>GPU alloc. | 4<br>max 2 per job         | 24        | 0        | no            | 12 / 12h           |
| **qdgx**         | > 0                  | cn202                      | 96        | 2        | yes           | 4 / 48h            |
| **qviz**         | > 0                  | 2 with NVIDIA Quadro P6000 | 4         | 2        | no            | 1 / 8h             |
| **qfat**         | > 0                  | 1 fat node                 | 128       | 2        | yes           | 24 / 48h           |
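
For example, an interactive allocation in the `qcpu` partition can be requested as follows
(a sketch; substitute your project ID):

```console
$ salloc -A PROJECT-ID -p qcpu
```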

[1]: access/project-access.md
+0 −26
Original line number Original line Diff line number Diff line
# Barbora Queues

Below is the list of queues available on the Barbora cluster:

| Queue            | Active project | Project resources    | Nodes                                                         | Min ncpus | Priority | Authorization | Walltime (default/max)  |
| ---------------- | -------------- | -------------------- | -------------------------------- | --------- | -------- | ------------- | ---------------------- |
| **qcpu**         | yes            | > 0                  | 190 nodes                        | 36        | 0        | no            | 24 / 48h               |
| **qcpu_biz**     | yes            | > 0                  | 190 nodes                        | 36        | 50       | no            | 24 / 48h               |
| **qcpu_exp**     | yes            | none required        | 16 nodes                         | 36        | 150      | no            | 1 / 1h                 |
| **qcpu_free**    | yes            | < 150% of allocation | 124 nodes<br>max 4 nodes per job | 36        | -100     | no            | 12 / 18h               |
| **qcpu_long**    | yes            | > 0                  | 60 nodes<br>max 20 nodes per job | 36        | 0        | no            | 72 / 144h              |
| **qcpu_preempt** | yes            | > 0                  | 190 nodes<br>max 4 nodes per job | 36        | -200     | no            | 12 / 12h               |
| **qgpu**         | yes            | > 0                  | 8 nodes                          | 24        | 0        | yes           | 24 / 48h               |
| **qgpu_biz**     | yes            | > 0                  | 8 nodes                          | 24        | 50       | yes           | 24 / 48h               |
| **qgpu_exp**     | yes            | none required        | 4 nodes<br>max 1 node per job    | 24        | 0        | no            | 1 / 1h                 |
| **qgpu_free**    | yes            | < 150% of allocation | 5 nodes<br>max 2 nodes per job   | 24        | -100     | no            | 12 / 18h               |
| **qgpu_preempt** | yes            | > 0                  | 4 nodes<br>max 2 nodes per job   | 24        | -200     | no            | 12 / 12h               |
| **qdgx**         | yes            | > 0                  | cn202                            | 96        | 0        | yes           | 4 / 48h                |
| **qviz**         | yes            | none required        | 2 nodes with NVIDIA Quadro P6000 | 4         | 0        | no            | 1 / 8h                 |
| **qfat**         | yes            | > 0                  | 1 fat node                       | 128       | 0        | yes           | 24 / 48h               |
| **Legacy Queues**                 |
| **qexp**         | no             | none required        | 16 nodes<br>max 4 nodes per job  | 36        | 150      | no            | 1 / 1h                 |
| **qprod**        | yes            | > 0                  | 190 nodes w/o accelerator        | 36        | 0        | no            | 24 / 48h               |
| **qlong**        | yes            | > 0                  | 60 nodes w/o accelerator<br>max 20 nodes per job     | 36        | 0        | no            | 72 / 144h              |
| **qnvidia**      | yes            | > 0                  | 8 NVIDIA nodes                   | 24        | 0        | yes           | 24 / 48h               |
| **qfree**        | yes            | < 150% of allocation | 192 w/o accelerator<br>max 32 nodes per job  | 36       | -100    | no            | 12 / 12h     |
Original line number Original line Diff line number Diff line
@@ -2,20 +2,26 @@


## Introduction
## Introduction


In many cases, it is useful to submit a huge (>100) number of computational jobs into the PBS queue system. A huge number of (small) jobs is one of the most effective ways to execute embarrassingly parallel calculations, achieving the best runtime, throughput, and computer utilization.
In many cases, it is useful to submit a huge number of computational jobs into the Slurm queue system.
A huge number of (small) jobs is one of the most effective ways to execute embarrassingly parallel calculations,
achieving the best runtime, throughput, and computer utilization. This is called **Capacity Computing**.


However, executing a huge number of jobs via the PBS queue may strain the system. This strain may result in slow response to commands, inefficient scheduling, and overall degradation of performance and user experience for all users. For this reason, the number of jobs is **limited to 100 jobs per user, 4,000 jobs and subjobs per user, 1,500 subjobs per job array**.
However, executing a huge number of jobs via the Slurm queue may strain the system. This strain may
result in slow response to commands, inefficient scheduling, and overall degradation of performance
and user experience for all users.  
We **recommend** using [**Job arrays**][1] or [**HyperQueue**][2] to execute many jobs.


!!! note
There are two primary scenarios:
    Follow one of the procedures below, in case you wish to schedule more than 100 jobs at a time.


* Use [Job arrays][1] when running a huge number of multithread (bound to one node only) or multinode (multithread across several nodes) jobs.
1. Number of jobs < 1500, **and** the jobs are able to utilize one or more **full** nodes:  
* Use [HyperQueue][3] when running a huge number of multithread jobs. HyperQueue can help overcome the limits of job arrays.
    Use [**Job arrays**][1].  
    A job array allows you to submit and control up to 1500 jobs (tasks) in one packet. Several job arrays may be submitted.


## Policy
2. Number of jobs >> 1500, **or** the jobs only utilize a **few cores/accelerators** each:  
    Use [**HyperQueue**][2].  
    HyperQueue can help efficiently load balance a very large number of jobs (tasks) amongst available computing nodes.
    HyperQueue may also be used if you have dependencies among the jobs.


1. A user is allowed to submit at most 100 jobs. Each job may be [a job array][1].
1. The array size is at most 1,000 subjobs.


[1]: job-arrays.md
[1]: job-arrays.md
[3]: hyperqueue.md
[2]: hyperqueue.md
 No newline at end of file
Original line number Original line Diff line number Diff line
# Energy Saving
# Energy Saving


Due to high energy prices and reductions in funding, IT4Innovations has implemented a set of energy saving measures on the supercomputing clusters. The measures are selected to minimize the performance impact and achieve significant cost, energy, and carbon footprint reduction effect.
IT4Innovations has implemented a set of energy saving measures on the supercomputing clusters. The measures are selected to minimize the performance impact and achieve significant cost, energy, and carbon footprint reduction effect.


The energy saving measures are effective as of **1.2.2023**.
The energy saving measures are effective as of **1.2.2023**.


Original line number Original line Diff line number Diff line
# HyperQueue
# HyperQueue


HyperQueue lets you build a computation plan consisting of a large amount of tasks and then execute it transparently over a system like SLURM/PBS.
HyperQueue lets you build a computation plan consisting of a large amount of tasks and then execute it transparently over a system like SLURM/PBS.
It dynamically groups tasks into PBS jobs and distributes them to fully utilize allocated nodes.
It dynamically groups tasks into Slurm jobs and distributes them to fully utilize allocated nodes.
You thus do not have to manually aggregate your tasks into PBS jobs.
You thus do not have to manually aggregate your tasks into Slurm jobs.


Find more about HyperQueue in its [documentation][a].
Find more about HyperQueue in its [documentation][a].


@@ -87,35 +87,35 @@ $ hq jobs


Before HyperQueue can execute your jobs, it needs to have access to some computational resources.
Before HyperQueue can execute your jobs, it needs to have access to some computational resources.
You can provide these by starting HyperQueue *workers* which connect to the server and execute your jobs.
You can provide these by starting HyperQueue *workers* which connect to the server and execute your jobs.
The workers should run on computing nodes, therefore they should be started inside PBS jobs.
The workers should run on computing nodes, therefore they should be started inside Slurm jobs.


There are two ways of providing computational resources.
There are two ways of providing computational resources.


* **Allocate PBS jobs automatically**
* **Allocate Slurm jobs automatically**


    HyperQueue can automatically submit PBS jobs with workers on your behalf. This system is called
    HyperQueue can automatically submit Slurm jobs with workers on your behalf. This system is called
    [automatic allocation][c]. After the server is started, you can add a new automatic allocation
    [automatic allocation][c]. After the server is started, you can add a new automatic allocation
    queue using the `hq alloc add` command:
    queue using the `hq alloc add` command:


    ```console
    ```console
    $ hq alloc add pbs -- -qqprod -AAccount1
    $ hq alloc add slurm -- -A<PROJECT-ID> -p qcpu_exp
    ```
    ```


    After you run this command, HQ will automatically start submitting PBS jobs on your behalf
    After you run this command, HQ will automatically start submitting Slurm jobs on your behalf
    once some HQ jobs are submitted.
    once some HQ jobs are submitted.


* **Manually start PBS jobs with HQ workers**
* **Manually start Slurm jobs with HQ workers**


    With the following command, you can submit a PBS job that will start a single HQ worker which
    With the following command, you can submit a Slurm job that will start a single HQ worker which
    will connect to a running HQ server.
    will connect to a running HQ server.


    ```console
    ```console
    $ qsub <qsub-params> -- /bin/bash -l -c "$(which hq) worker start"
    $ salloc <salloc-params> -- /bin/bash -l -c "$(which hq) worker start"
    ```
    ```


!!! tip
!!! tip
    For debugging purposes, you can also start the worker e.g. on a login node, simply by running
    For debugging purposes, you can also start the worker e.g. on a login node, simply by running
    `$ hq worker start`. Do not use such worker for any long-running computations though.
    `$ hq worker start`. Do not use such worker for any long-running computations though!
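
Regardless of how the workers were started, you can verify that they have connected to the running server:

```console
$ hq worker list
```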


## Architecture
## Architecture


Original line number Original line Diff line number Diff line
# Job Arrays
# Job Arrays


A job array is a compact representation of many jobs called subjobs. Subjobs share the same job script, and have the same values for all attributes and resources, with the following exceptions:
A job array is a compact representation of many jobs called tasks. Tasks share the same job script, and have the same values for all attributes and resources, with the following exceptions:


* each subjob has a unique index, $PBS_ARRAY_INDEX
* each task has a unique index, `$SLURM_ARRAY_TASK_ID`
* job Identifiers of subjobs only differ by their indices
* job Identifiers of tasks only differ by their indices
* the state of subjobs can differ (R, Q, etc.)
* the state of tasks can differ


All subjobs within a job array have the same scheduling priority and schedule as independent jobs. An entire job array is submitted through a single `qsub` command and may be managed by `qdel`, `qalter`, `qhold`, `qrls`, and `qsig` commands as a single job.
All tasks within a job array have the same scheduling priority and schedule as independent jobs. An entire job array is submitted through a single `sbatch` command and may be managed by `squeue`, `scancel` and `scontrol` commands as a single job.


## Shared Jobscript
## Shared Jobscript


All subjobs in a job array use the very same single jobscript. Each subjob runs its own instance of the jobscript. The instances execute different work controlled by the `$PBS_ARRAY_INDEX` variable.
All tasks in a job array use the very same single jobscript. Each task runs its own instance of the jobscript. The instances execute different work controlled by the `$SLURM_ARRAY_TASK_ID` variable.


Example:
Example:


Assume we have 900 input files with the name of each beginning with "file" (e.g. file001, ..., file900). Assume we would like to use each of these input files with myprog.x program executable, each as a separate job.
Assume we have 900 input files with the name of each beginning with "file" (e.g. file001, ..., file900). Assume we would like to use each of these input files with myprog.x program executable,
each as a separate, single-node job running 128 threads.


First, we create a tasklist file (or subjobs list), listing all tasks (subjobs) - all input files in our example:
First, we create a `tasklist` file, listing all tasks - all input files in our example:


```console
```console
$ find . -name 'file*' > tasklist
$ find . -name 'file*' > tasklist
@@ -26,117 +27,74 @@ Then we create a jobscript:


```bash
```bash
#!/bin/bash
#!/bin/bash
#PBS -A OPEN-00-00
#SBATCH -p qcpu
#PBS -q qprod
#SBATCH -A PROJECT-ID
#PBS -l select=1,walltime=02:00:00
#SBATCH --nodes 1 --ntasks-per-node 1 --cpus-per-task 128 
#SBATCH -t 02:00:00
#SBATCH -o /dev/null


# change to scratch directory
# change to scratch directory
SCRDIR=/scratch/project/${PBS_ACCOUNT,,}/${USER}/${PBS_JOBID}
SCRDIR=/scratch/project/$SLURM_JOB_ACCOUNT/$SLURM_JOB_USER/$SLURM_JOB_ID
mkdir -p $SCRDIR
mkdir -p $SCRDIR
cd $SCRDIR || exit
cd $SCRDIR || exit


# get individual tasks from tasklist with index from PBS JOB ARRAY
# get individual tasks from tasklist with index from SLURM JOB ARRAY
TASK=$(sed -n "${PBS_ARRAY_INDEX}p" $PBS_O_WORKDIR/tasklist)
TASK=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $SLURM_SUBMIT_DIR/tasklist)


# copy input file and executable to scratch
# copy input file and executable to scratch
cp $PBS_O_WORKDIR/$TASK input
cp $SLURM_SUBMIT_DIR/$TASK input
cp $PBS_O_WORKDIR/myprog.x .
cp $SLURM_SUBMIT_DIR/myprog.x .


# execute the calculation
# execute the calculation
./myprog.x < input > output
./myprog.x < input > output


# copy output file to submit directory
# copy output file to submit directory
cp output $PBS_O_WORKDIR/$TASK.out
cp output $SLURM_SUBMIT_DIR/$TASK.out
```
```


In this example, the submit directory contains the 900 input files, the myprog.x executable, and the jobscript file. As an input for each run, we take the filename of the input file from the created tasklist file. We copy the input file to the local scratch memory `/lscratch/$PBS_JOBID`, execute the myprog.x and copy the output file back to the submit directory, under the `$TASK.out` name. The myprog.x executable runs on one node only and must use threads to run in parallel. Be aware, that if the myprog.x **is not multithreaded**, then all the **jobs are run as single-thread programs in a sequential manner**. Due to the allocation of the whole node, the accounted time is equal to the usage of the whole node, while using only 1/16 of the node.
In this example, the submit directory contains the 900 input files, the myprog.x executable,
and the jobscript file. As an input for each run, we take the filename of the input file from the created
tasklist file. We copy the input file to a scratch directory  `/scratch/project/$SLURM_JOB_ACCOUNT/$SLURM_JOB_USER/$SLURM_JOB_ID`,
execute the myprog.x and copy the output file back to the submit directory, under the `$TASK.out` name. The myprog.x executable runs on one node only and must use threads to run in parallel.
Be aware that if myprog.x **is not multithreaded or multi-process (MPI)**, then all the **jobs are run as single-thread programs, wasting node resources**.


If running a huge number of parallel multicore (in means of multinode multithread, e.g. MPI enabled) jobs is needed, then a job array approach should be used. The main difference, as compared to the previous examples using one node, is that the local scratch memory should not be used (as it is not shared between nodes) and MPI or other techniques for parallel multinode processing has to be used properly.
## Submitting Job Array


## Submiting Job Array
To submit the job array, use the `sbatch --array` command. The 900 jobs of the [example above][2] may be submitted like this:

To submit the job array, use the `qsub -J` command. The 900 jobs of the [example above][3] may be submitted like this:

```console
$ qsub -N JOBNAME -J 1-900 jobscript
506493[].isrv5
```

In this example, we submit a job array of 900 subjobs. Each subjob will run on one full node and is assumed to take less than 2 hours (note the #PBS directives in the beginning of the jobscript file, do not forget to set your valid PROJECT_ID and desired queue).

Sometimes for testing purposes, you may need to submit a one-element only array. This is not allowed by PBSPro, but there is a workaround:


```console
```console
$ qsub -N JOBNAME -J 9-10:2 jobscript
$ sbatch -J JOBNAME --array 1-900 ./jobscript
```
```


This will only choose the lower index (9 in this example) for submitting/running your job.
In this example, we submit a job array of 900 tasks. Each task will run on one full node and is assumed to take less than 2 hours (note the #SBATCH directives in the beginning of the jobscript file, do not forget to set your valid PROJECT_ID and desired queue).
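
If you need to limit how many tasks run simultaneously, append a `%` limit to the array specification;
for example, to let at most 100 of the 900 tasks run at the same time:

```console
$ sbatch -J JOBNAME --array 1-900%100 ./jobscript
```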


## Managing Job Array
## Managing Job Array


Check status of the job array using the `qstat` command.
Check status of the job array using the `squeue --me` command, alternatively `squeue --me --array`.


```console
```console
$ qstat -a 12345[].dm2
$  squeue --me --long

          JOBID PARTITION     NAME   USER    STATE       TIME     TIME_LIMI  NODES NODELIST(REASON)
dm2:
2499924_[1-900]      qcpu  myarray   user  PENDING       0:00      02:00:00      1 (Resources)
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- --  |---|---| ------ --- --- ------ ----- - -----
12345[].dm2     user2    qprod    xx          13516   1 16    --  00:50 B 00:02
```
```

Check the status of the tasks using the `squeue` command.
When the status is B, it means that some subjobs are already running.
Check the status of the first 100 subjobs using the `qstat` command.


```console
```console
$ qstat -a 12345[1-100].dm2
$ squeue -j 2499924 --long

    JOBID PARTITION     NAME   USER    STATE       TIME     TIME_LIMI  NODES NODELIST(REASON)
dm2:
2499924_1      qcpu  myarray   user  PENDING       0:00      02:00:00      1 (Resources)
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- --  |---|---| ------ --- --- ------ ----- - -----
12345[1].dm2    user2    qprod    xx          13516   1 16    --  00:50 R 00:02
12345[2].dm2    user2    qprod    xx          13516   1 16    --  00:50 R 00:02
12345[3].dm2    user2    qprod    xx          13516   1 16    --  00:50 R 00:01
12345[4].dm2    user2    qprod    xx          13516   1 16    --  00:50 Q   --
     .             .        .      .             .    .   .     .    .   .    .
     .             .        .      .             .    .   .     .    .   .    .
     ,             .        .      .             .    .   .     .    .   .    .
     .             .        .      .             .    .   .     .    .   .    .
12345[100].dm2 user2    qprod    xx          13516   1 16    --  00:50 Q   --
2499924_900    qcpu  myarray   user  PENDING       0:00      02:00:00      1 (Resources)
```

Delete the entire job array. Running subjobs will be killed, queueing subjobs will be deleted.

```console
$ qdel 12345[].dm2
```

Deleting large job arrays may take a while.
Display status information for all user's jobs, job arrays, and subjobs.

```console
$ qstat -u $USER -t
```
```


Display status information for all user's subjobs.
Delete the entire job array. Running tasks will be killed and queued tasks will be deleted.


```console
```console
$ qstat -u $USER -tJ
$ scancel 2499924
```
```


For more information on job arrays, see the [PBSPro Users guide][1].
For more information on job arrays, see the [SLURM guide][1].
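
Once the array has finished, you can also query the Slurm accounting database for a per-task summary,
using the job ID from the examples above:

```console
$ sacct -j 2499924 --format=JobID,JobName,State,Elapsed,ExitCode
```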

## Examples

Download the examples in [capacity.zip][2], illustrating the above listed ways to run a huge number of jobs. We recommend trying out the examples before using this for running production jobs.

Unzip the archive in an empty directory on cluster and follow the instructions in the README file-

```console
$ unzip capacity.zip
$ cat README
```


[1]: ../pbspro.md
[1]: https://slurm.schedmd.com/job_array.html
[2]: capacity.zip
[2]: #shared-jobscript
[3]: #shared-jobscript
Original line number Original line Diff line number Diff line
# Job Scheduling
# Job Scheduling


## Job Execution Priority
## Job Priority


The scheduler gives each job an execution priority and then uses this job execution priority to select which job(s) to run.
The scheduler gives each job a priority and then uses this job priority to select which job(s) to run.


Job execution priority is determined by these job properties (in order of importance):
Job priority is determined by these job properties (in order of importance):


1. queue priority
1. queue priority
1. fair-share priority
1. fair-share priority
1. eligible time
1. job age/eligible time


### Queue Priority
### Queue Priority


Queue priority is the priority of the queue in which the job is waiting prior to execution.
Queue priority is the priority of the queue in which the job is waiting prior to execution.


Queue priority has the biggest impact on job execution priority. The execution priority of jobs in higher priority queues is always greater than the execution priority of jobs in lower priority queues. Other properties of jobs used for determining the job execution priority (fair-share priority, eligible time) cannot compete with queue priority.
Queue priority has the biggest impact on job priority. The priority of jobs in higher priority queues is always greater than the priority of jobs in lower priority queues. Other properties of jobs used for determining the job priority (fair-share priority, eligible time) cannot compete with queue priority.


Queue priorities can be seen [here][a].
Queue priorities can be seen [here][a].


@@ -24,35 +24,17 @@ Fair-share priority is calculated based on recent usage of resources. Fair-share


Fair-share priority is used for ranking jobs with equal queue priority.
Fair-share priority is used for ranking jobs with equal queue priority.


Fair-share priority is calculated as:
Usage decays, halving at intervals of 7 days.


---8<--- "fairshare_formula.md"
### Job Age/Eligible Time


where MAX_FAIRSHARE has the value of 1E6
The job age factor represents the length of time a job has been sitting in the queue and eligible to run.


usage<sub>Project</sub> is the usage accumulated by all members of a selected project
Job age has the least impact on priority.

usage<sub>Total</sub> is the total usage by all users, across all projects.

Usage counts allocated node-hours (`ncpus x walltime`). Usage decays, halving at intervals of 168 hours (one week).
Jobs queued in the queue qexp are not used to calculate the project's usage.

!!! note
    Calculated usage and fair-share priority can be seen [here][b].

Calculated fair-share priority can also be seen in the Resource_List.fairshare attribute of a job.

### Eligible Time

Eligible time is the amount of eligible time (in seconds) a job accrues while waiting to run. Jobs with higher eligible time gain higher priority.

Eligible time has the least impact on execution priority. Eligible time is used for sorting jobs with equal queue priority and fair-share priority. It is very, very difficult for eligible time to compete with fair-share priority.

Eligible time can be seen in the `eligible_time` attribute of a job.


### Formula
### Formula


Job execution priority (job sort formula) is calculated as:
Job priority is calculated as:


---8<--- "job_sort_formula.md"
---8<--- "job_sort_formula.md"


@@ -60,24 +42,44 @@ Job execution priority (job sort formula) is calculated as:


The scheduler uses job backfilling.
The scheduler uses job backfilling.


Backfilling means fitting smaller jobs around the higher-priority jobs that the scheduler is going to run next, in such a way that the higher-priority jobs are not delayed. Backfilling allows us to keep resources from becoming idle when the top job (the job with the highest execution priority) cannot run.
Backfilling means fitting smaller jobs around the higher-priority jobs that the scheduler is going to run next, in such a way that the higher-priority jobs are not delayed. Backfilling allows us to keep resources from becoming idle when the top job (the job with the highest priority) cannot run.


The scheduler makes a list of jobs to run in order of execution priority. The scheduler looks for smaller jobs that can fit into the usage gaps around the highest-priority jobs in the list. The scheduler looks in the prioritized list of jobs and chooses the highest-priority smaller jobs that fit. Filler jobs are run only if they will not delay the start time of top jobs.
The scheduler makes a list of jobs to run in order of priority. The scheduler looks for smaller jobs that can fit into the usage gaps around the highest-priority jobs in the list. The scheduler looks in the prioritized list of jobs and chooses the highest-priority smaller jobs that fit. Filler jobs are run only if they will not delay the start time of top jobs.


This means that jobs with lower execution priority can be run before jobs with higher execution priority.
This means that jobs with lower priority can be run before jobs with higher priority.


!!! note
!!! note
    It is **very beneficial to specify the walltime** when submitting jobs.
    It is **very beneficial to specify the timelimit** when submitting jobs.


Specifying more accurate walltime enables better scheduling, better execution times, and better resource usage. Jobs with suitable (small) walltime can be backfilled - and overtake job(s) with a higher priority.
Specifying a more accurate timelimit enables better scheduling, better execution times, and better resource usage. Jobs with a suitable (small) timelimit can be backfilled - and may overtake job(s) with a higher priority.
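
For example, a job expected to finish well within two hours can state that explicitly at submission time (an illustrative sketch; the project ID, partition, and script name are placeholders):

```console
$ sbatch -A PROJECT-ID -p qcpu --time 02:00:00 myjob.sh
```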


---8<--- "mathjax.md"
---8<--- "mathjax.md"


### Job Placement
## Technical Details

Priorities are set using Slurm's [Multifactor Priority Plugin][1]. Current settings are as follows:

```
$ grep ^Priority /etc/slurm/slurm.conf
PriorityFlags=DEPTH_OBLIVIOUS
PriorityType=priority/multifactor
PriorityDecayHalfLife=7-0
PriorityMaxAge=14-0
PriorityWeightAge=100000
PriorityWeightFairshare=10000000
PriorityWeightPartition=1000000000
```

## Inspecting Job Priority

One can inspect job priority using the `sprio` command. The job priority is shown in the PRIORITY field and is composed of the PARTITION, FAIRSHARE, and AGE priorities.


Job [placement can be controlled by flags during submission][1].
```
$ sprio -l -j 894782
          JOBID PARTITION     USER  ACCOUNT   PRIORITY       SITE        AGE      ASSOC  FAIRSHARE    JOBSIZE  PARTITION    QOSNAME        QOS        NICE                 TRES
         894782 qgpu         user1  service  300026688          0         17          0      26671          0  300000000     normal          0           0
```
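
Note that the resulting PRIORITY is simply the sum of the individual contributions shown by `sprio`:

```
PRIORITY = PARTITION + FAIRSHARE + AGE = 300000000 + 26671 + 17 = 300026688
```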


[1]: job-submission-and-execution.md#advanced-job-placement
[1]: https://slurm.schedmd.com/priority_multifactor.html


[a]: https://extranet.it4i.cz/rsweb/barbora/queues
[a]: https://extranet.it4i.cz/rsweb/karolina/queues
[b]: https://extranet.it4i.cz/rsweb/barbora/projects
+1 −458
Original line number Original line Diff line number Diff line
# Job Submission and Execution

## Job Submission

When allocating computational resources for the job, specify:

1. a suitable queue for your job (the default is qprod)
1. the number of computational nodes (required)
1. the number of cores per node (not required)
1. the maximum wall time allocated to your calculation, note that jobs exceeding the maximum wall time will be killed
1. your Project ID
1. a Jobscript or interactive switch

Submit the job using the `qsub` command:

```console
$ qsub -A Project_ID -q queue -l select=x:ncpus=y,walltime=[[hh:]mm:]ss[.ms] jobscript
```

The `qsub` command submits the job to the queue, i.e. it creates a request to the PBS Job manager for allocation of specified resources. The resources will be allocated when available, subject to the above described policies and constraints. **After the resources are allocated, the jobscript or interactive shell is executed on the first of the allocated nodes.**

!!! note
    `ncpus=y` is usually not required, because the smallest allocation unit is an entire node. The exception are corner cases for `qviz` and `qfat` on Karolina.

### Job Submission Examples

```console
$ qsub -A OPEN-0-0 -q qprod -l select=64,walltime=03:00:00 ./myjob
```

In this example, we allocate 64 nodes, 36 cores per node, for 3 hours. We allocate these resources via the `qprod` queue, consumed resources will be accounted to the project identified by Project ID `OPEN-0-0`. The jobscript `myjob` will be executed on the first node in the allocation.

```console
$ qsub -q qexp -l select=4 -I
```

In this example, we allocate 4 nodes, 36 cores per node, for 1 hour. We allocate these resources via the `qexp` queue. The resources will be available interactively.

```console
$ qsub -A OPEN-0-0 -q qnvidia -l select=10 ./myjob
```

In this example, we allocate 10 NVIDIA accelerated nodes, 24 cores per node, for 24 hours. We allocate these resources via the `qnvidia` queue. The jobscript `myjob` will be executed on the first node in the allocation.

```console
$ qsub -A OPEN-0-0 -q qfree -l select=10 ./myjob
```

In this example, we allocate 10 nodes, 24 cores per node, for 12 hours. We allocate these resources via the `qfree` queue. It is not required that the project `OPEN-0-0` has any available resources left. Consumed resources are still accounted for. The jobscript `myjob` will be executed on the first node in the allocation.

All `qsub` options may be [saved directly into the jobscript][1]. In such cases, it is not necessary to specify any options for `qsub`.

```console
$ qsub ./myjob
```

By default, the PBS batch system sends an email only when the job is aborted. Disabling mail events completely can be done as follows:

```console
$ qsub -m n
```

#### Dependency Job Submission

To submit dependent jobs in sequence, use the `depend` function of `qsub`.

First submit the first job in a standard manner:

```console
$ qsub -A OPEN-0-0 -q qprod -l select=64,walltime=02:00:00 ./firstjob
123456[].isrv1
```

Then submit the second job using the `depend` function:

```console
$ qsub -W depend=afterok:123456 ./secondjob
```

Both jobs will be queued, but the second job won't start until the first job has finished successfully.

Below is the list of arguments that can be used with `-W depend=dependency:jobid`:

| Argument    | Description                                                     |
| ----------- | --------------------------------------------------------------- |
| after       | This job is scheduled after `jobid` begins execution.       |
| afterok     | This job is scheduled after `jobid` finishes successfully.  |
| afternotok  | This job is scheduled after `jobid` finishes unsuccessfully. |
| afterany    | This job is scheduled after `jobid` finishes in any state.  |
| before      | This job must begin execution before `jobid` is scheduled.  |
| beforeok    | This job must finish successfully before `jobid` begins.        |
| beforenotok | This job must finish unsuccessfully before `jobid` begins.      |
| beforeany   | This job must finish in any state before `jobid` begins.        |

### Useful Tricks

All `qsub` options may be [saved directly into the jobscript][1]. In such a case, no options to `qsub` are needed.

```console
$ qsub ./myjob
```

By default, the PBS batch system sends an email only when the job is aborted. Disabling mail events completely can be done like this:

```console
$ qsub -m n
```

<!--- NOT IMPLEMENTED ON KAROLINA YET

## Advanced Job Placement

### Salomon - Placement by Network Location

The network location of allocated nodes in the [InfiniBand network][3] influences efficiency of network communication between nodes of job. Nodes on the same InfiniBand switch communicate faster with lower latency than distant nodes. To improve communication efficiency of jobs, PBS scheduler on Salomon is configured to allocate nodes (from currently available resources), which are as close as possible in the network topology.

For communication intensive jobs, it is possible to set stricter requirement - to require nodes directly connected to the same InfiniBand switch or to require nodes located in the same dimension group of the InfiniBand network.

### Salomon - Placement by InfiniBand Switch

Nodes directly connected to the same InfiniBand switch can communicate most efficiently. Using the same switch prevents hops in the network and provides for unbiased, most efficient network communication. There are 9 nodes directly connected to every InfiniBand switch.

!!! note
    We recommend allocating compute nodes of a single switch when the best possible computational network performance is required to run job efficiently.

Nodes directly connected to the one InfiniBand switch can be allocated using node grouping on the PBS resource attribute `switch`.

In this example, we request all 9 nodes directly connected to the same switch using node grouping placement.

```console
$ qsub -A OPEN-0-0 -q qprod -l select=9 -l place=group=switch ./myjob
```

-->

## Advanced Job Handling

### Selecting Turbo Boost Off

!!! note
    For Barbora only.

Intel Turbo Boost Technology is on by default. We strongly recommend keeping the default.

If necessary (such as in the case of benchmarking), you can disable Turbo for all nodes of the job by using the PBS resource attribute `cpu_turbo_boost`:

```console
$ qsub -A OPEN-0-0 -q qprod -l select=4 -l cpu_turbo_boost=0 -I
```

More information about the Intel Turbo Boost can be found in the TurboBoost section.

### Advanced Examples

In the following example, we select an allocation for benchmarking a very special and demanding MPI program. We request Turbo off, and 2 full chassis of compute nodes (nodes sharing the same IB switches) for 30 minutes:

```console
$ qsub -A OPEN-0-0 -q qprod
    -l select=18:ibswitch=isw10:mpiprocs=1:ompthreads=16+18:ibswitch=isw20:mpiprocs=16:ompthreads=1
    -l cpu_turbo_boost=0,walltime=00:30:00
    -N Benchmark ./mybenchmark
```

The MPI processes will be distributed differently on the nodes connected to the two switches. On the isw10 nodes, we will run 1 MPI process per node with 16 threads per process, on isw20 nodes we will run 16 plain MPI processes.

Although this example is somewhat artificial, it demonstrates the flexibility of the qsub command options.

## Job Management

!!! note
    Check the status of your jobs using the `qstat` and `check-pbs-jobs` commands

```console
$ qstat -a
$ qstat -a -u username
$ qstat -an -u username
$ qstat -f 12345.srv11
```

Example:

```console
$ qstat -a

srv11:
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
16287.srv11 user1    qlong    job1         6183   4 64   --  144:0 R 38:25
16468.srv11 user1    qlong    job2         8060   4 64   --  144:0 R 17:44
16547.srv11 user2    qprod    job3x       13516   2 32   --  48:00 R 00:58
```

In this example user1 and user2 are running jobs named `job1`, `job2`, and `job3x`. `job1` and `job2` are using 4 nodes, 128 cores per node each. `job1` has already run for 38 hours and 25 minutes, and `job2` for 17 hours 44 minutes. So `job1`, for example, has already consumed `64 x 38.41 = 2,458.6` core-hours. `job3x` has already consumed `32 x 0.96 = 30.93` core-hours. These consumed core-hours will be [converted to node-hours][10] and accounted for on the respective project accounts, regardless of whether the allocated cores were actually used for computations.

The following commands allow you to check the status of your jobs using the `check-pbs-jobs` command, check for the presence of user's PBS jobs' processes on execution hosts, display load and processes, display job standard and error output, and continuously display (`tail -f`) job standard or error output.

```console
$ check-pbs-jobs --check-all
$ check-pbs-jobs --print-load --print-processes
$ check-pbs-jobs --print-job-out --print-job-err
$ check-pbs-jobs --jobid JOBID --check-all --print-all
$ check-pbs-jobs --jobid JOBID --tailf-job-out
```

Examples:

```console
$ check-pbs-jobs --check-all
JOB 35141.dm2, session_id 71995, user user2, nodes cn164,cn165
Check session id: OK
Check processes
cn164: OK
cn165: No process
```

In this example we see that job `35141.dm2` is not currently running any processes on the allocated node cn165, which may indicate an execution error:

```console
$ check-pbs-jobs --print-load --print-processes
JOB 35141.dm2, session_id 71995, user user2, nodes cn164,cn165
Print load
cn164: LOAD: 16.01, 16.01, 16.00
cn165: LOAD:  0.01,  0.00,  0.01
Print processes
       %CPU CMD
cn164:  0.0 -bash
cn164:  0.0 /bin/bash /var/spool/PBS/mom_priv/jobs/35141.dm2.SC
cn164: 99.7 run-task
...
```

In this example, we see that job `35141.dm2` is currently running a process run-task on node `cn164`, using one thread only, while node `cn165` is empty, which may indicate an execution error.

```console
$ check-pbs-jobs --jobid 35141.dm2 --print-job-out
JOB 35141.dm2, session_id 71995, user user2, nodes cn164,cn165
Print job standard output:
======================== Job start  ==========================
Started at    : Fri Aug 30 02:47:53 CEST 2013
Script name   : script
Run loop 1
Run loop 2
Run loop 3
```

In this example, we see the actual output (some iteration loops) of the job `35141.dm2`.

!!! note
    Manage your queued or running jobs, using the `qhold`, `qrls`, `qdel`, `qsig`, or `qalter` commands

You may release your allocation at any time, using the `qdel` command

```console
$ qdel 12345.srv11
```

You may kill a running job by force, using the `qsig` command

```console
$ qsig -s 9 12345.srv11
```

Learn more by reading the PBS man page

```console
$ man pbs_professional
```

## Job Execution

### Jobscript

!!! note
    Prepare the jobscript to run batch jobs in the PBS queue system

The Jobscript is a user made script controlling a sequence of commands for executing the calculation. It is often written in bash, though other scripts may be used as well. The jobscript is supplied to the PBS `qsub` command as an argument, and is executed by the PBS Professional workload manager.

!!! note
    The jobscript or interactive shell is executed on first of the allocated nodes.

```console
$ qsub -q qexp -l select=4 -N Name0 ./myjob
$ qstat -n -u username

srv11:
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
15209.srv11     username qexp     Name0        5530   4 128    --  01:00 R 00:00
   cn17/0*32+cn108/0*32+cn109/0*32+cn110/0*32
```

In this example, the nodes `cn17`, `cn108`, `cn109`, and `cn110` were allocated for 1 hour via the qexp queue. The `myjob` jobscript will be executed on the node `cn17`, while the nodes `cn108`, `cn109`, and `cn110` are available for use as well.

The jobscript or interactive shell is by default executed in the `/home` directory:

```console
$ qsub -q qexp -l select=4 -I
qsub: waiting for job 15210.srv11 to start
qsub: job 15210.srv11 ready

$ pwd
/home/username
```

In this example, 4 nodes were allocated interactively for 1 hour via the `qexp` queue. The interactive shell is executed in the `/home` directory.

!!! note
    All nodes within the allocation may be accessed via SSH. Unallocated nodes are not accessible to the user.

The allocated nodes are accessible via SSH from login nodes. The nodes may access each other via SSH as well.

Calculations on allocated nodes may be executed remotely via the MPI, SSH, pdsh, or clush. You may find out which nodes belong to the allocation by reading the `$PBS_NODEFILE` file

```console
$ qsub -q qexp -l select=4 -I
qsub: waiting for job 15210.srv11 to start
qsub: job 15210.srv11 ready

$ pwd
/home/username

$ sort -u $PBS_NODEFILE
cn17.bullx
cn108.bullx
cn109.bullx
cn110.bullx

$ pdsh -w cn17,cn[108-110] hostname
cn17: cn17
cn108: cn108
cn109: cn109
cn110: cn110
```

In this example, the hostname program is executed via `pdsh` from the interactive shell. The execution runs on all four allocated nodes. The same result would be achieved if the `pdsh` were called from any of the allocated nodes or from the login nodes.

### Example Jobscript for MPI Calculation

!!! note
    Production jobs must use the /scratch directory for I/O

The recommended way to run production jobs is to change to the `/scratch` directory early in the jobscript, copy all inputs to `/scratch`, execute the calculations, and copy outputs to the `/home` directory.

```bash
#!/bin/bash

cd $PBS_O_WORKDIR

SCRDIR=/scratch/project/open-00-00/${USER}/myjob
mkdir -p $SCRDIR

# change to scratch directory, exit on failure
cd $SCRDIR || exit

# copy input file to scratch
cp $PBS_O_WORKDIR/input .
cp $PBS_O_WORKDIR/mympiprog.x .

# load the MPI module
# (Always specify the module's name and version in your script;
# for the reason, see https://docs.it4i.cz/software/modules/lmod/#loading-modules.)
ml OpenMPI/4.1.1-GCC-10.2.0-Java-1.8.0_221

# execute the calculation
mpirun -pernode ./mympiprog.x

# copy output file to home
cp output $PBS_O_WORKDIR/.

#exit
exit
```

In this example, a directory in `/home` holds the input file input and the `mympiprog.x` executable. We create the `myjob` directory on the `/scratch` filesystem, copy input and executable files from the `/home` directory where the `qsub` was invoked (`$PBS_O_WORKDIR`) to `/scratch`, execute the MPI program `mympiprog.x` and copy the output file back to the `/home` directory. `mympiprog.x` is executed as one process per node, on all allocated nodes.

!!! note
    Consider preloading inputs and executables onto [shared scratch][6] memory before the calculation starts.

In some cases, it may be impractical to copy the inputs to the `/scratch` memory and the outputs to the `/home` directory. This is especially true when very large input and output files are expected, or when the files should be reused by a subsequent calculation. In such cases, it is the users' responsibility to preload the input files on the shared `/scratch` memory before the job submission, and retrieve the outputs manually after all calculations are finished.

!!! note
    Store the `qsub` options within the jobscript. Use the `mpiprocs` and `ompthreads` qsub options to control the MPI job execution.

### Example Jobscript for MPI Calculation With Preloaded Inputs

Example jobscript for an MPI job with preloaded inputs and executables, options for `qsub` are stored within the script:

```bash
#!/bin/bash
#PBS -q qprod
#PBS -N MYJOB
#PBS -l select=100:mpiprocs=1:ompthreads=16
#PBS -A OPEN-00-00

# job is run using project resources; here ${PBS_ACCOUNT,,} translates to "open-00-00"
SCRDIR=/scratch/project/${PBS_ACCOUNT,,}/${USER}/myjob

# change to scratch directory, exit on failure
cd $SCRDIR || exit

# load the MPI module
# (Always specify the module's name and version in your script;
# for the reason, see https://docs.it4i.cz/software/modules/lmod/#loading-modules.)
ml OpenMPI/4.1.1-GCC-10.2.0-Java-1.8.0_221

# execute the calculation
mpirun ./mympiprog.x

#exit
exit
```

In this example, input and executable files are assumed to be preloaded manually in the `/scratch/project/open-00-00/$USER/myjob` directory. Because we used the `qprod` queue, we had to specify which project's resources we want to use, and our `PBS_ACCOUNT` variable will be set accordingly (OPEN-00-00). `${PBS_ACCOUNT,,}` uses one of the bash's built-in functions to translate it into lower case.

Note the `mpiprocs` and `ompthreads` qsub options controlling the behavior of the MPI execution. `mympiprog.x` is executed as one process per node, on all 100 allocated nodes. If `mympiprog.x` implements OpenMP threads, it will run 16 threads per node.

### Example Jobscript for Single Node Calculation

!!! note
    The local scratch directory is often useful for single node jobs. Local scratch memory will be deleted immediately after the job ends.

Example jobscript for single node calculation, using [local scratch][6] memory on the node:

```bash
#!/bin/bash

# change to local scratch directory
cd /lscratch/$PBS_JOBID || exit

# copy input file to scratch
cp $PBS_O_WORKDIR/input .
cp $PBS_O_WORKDIR/myprog.x .

# execute the calculation
./myprog.x

# copy output file to home
cp output $PBS_O_WORKDIR/.

#exit
exit
```

In this example, a directory in `/home` holds the input file input and the executable `myprog.x`. We copy input and executable files from the `/home` directory where the `qsub` was invoked (`$PBS_O_WORKDIR`) to the local `/scratch` memory `/lscratch/$PBS_JOBID`, execute `myprog.x` and copy the output file back to the `/home` directory. `myprog.x` runs on one node only and may use threads.

### Other Jobscript Examples

Further jobscript examples may be found in the software section and the [Capacity computing][9] section.

[1]: #example-jobscript-for-mpi-calculation-with-preloaded-inputs
[2]: resources-allocation-policy.md
[3]: ../salomon/network.md
[5]: ../salomon/7d-enhanced-hypercube.md
[6]: ../salomon/storage.md
[9]: capacity-computing.md
[10]: resources-allocation-policy.md#resource-accounting-policy
Original line number Original line Diff line number Diff line
!!!warning
    This page has not been updated yet. The page does not reflect the transition from PBS to Slurm.

# Parallel Runs Setting on Karolina
# Parallel Runs Setting on Karolina


Important aspect of each parallel application is correct placement of MPI processes
Important aspect of each parallel application is correct placement of MPI processes
Original line number Original line Diff line number Diff line
---
hide:
  - toc
---

# Karolina Partitions

!!! important
    Active [project membership][1] is required to run jobs.

Below is the list of partitions available on the Karolina cluster:

| Partition        | Project resources    | Nodes                                                     | Min ncpus   | Priority | Authorization | Walltime (def/max) |
| ---------------- | -------------------- | --------------------------------------------------------- | ----------- | -------- | ------------- | ------------------ |
| **qcpu**         | > 0                  | 720                                                       | 128         | 2        | no            | 24 / 48h           |
| **qcpu_biz**     | > 0                  | 720                                                       | 128         | 3        | no            | 24 / 48h           |
| **qcpu_exp**     | < 150% of allocation | 720<br>max 2 per user                                     | 128         | 4        | no            | 1 / 1h             |
| **qcpu_free**    | < 150% of allocation | 720<br>max 4 per job                                      | 128         | 1        | no            | 12 / 18h           |
| **qcpu_long**    | > 0                  | 200<br>max 20 per job, only non-accelerated nodes allowed | 128         | 2        | no            | 72 / 144h          |
| **qcpu_preempt** | active Karolina<br> CPU alloc. | 720<br>max 4 per job                                      | 128         | 0        | no            | 12 / 12h           |
| **qgpu**         | > 0                  | 72<br>max 16 per job                                      | 16<br>1 gpu | 3        | yes           | 24 / 48h           |
| **qgpu_big**     | > 0                  | 72<br>max 64 per job                                      | 128         | 2        | yes           | 12 / 12h           |
| **qgpu_biz**     | > 0                  | 72<br>max 16 per job                                      | 128         | 4        | yes           | 24 / 48h           |
| **qgpu_exp**     | < 150% of allocation | 4<br>max 1 per job                                        | 16<br>1 gpu | 5        | no            | 1 / 1h             |
| **qgpu_free**    | < 150% of allocation | 46<br>max 2 per job                                       | 16<br>1 gpu | 1        | no            | 12 / 18h           |
| **qgpu_preempt** | active Karolina<br> GPU alloc. | 72<br>max 2 per job                                       | 16<br>1 gpu | 0        | no            | 12 / 12h           |
| **qviz**         | > 0                  | 2 with NVIDIA® Quadro RTX™ 6000                           | 8           | 2        | no            | 1 / 8h             |
| **qfat**         | > 0                  | 1 (sdf1)                                                  | 24          | 2        | yes           | 24 / 48h           |
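
For illustration, a short interactive test allocation in the express partition could be requested as follows (the project ID is a placeholder):

```console
$ salloc -A PROJECT-ID -p qcpu_exp --time 00:30:00
```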

[1]: access/project-access.md
+0 −31
Original line number Original line Diff line number Diff line
# Karolina Queues

Below is the list of queues available on the Karolina cluster:

| Queue            | Active project | Project resources    | Nodes                                                         | Min ncpus | Priority | Authorization | Walltime (default/max)  |
| ---------------- | -------------- | -------------------- | ------------------------------------------------------------- | --------- | -------- | ------------- | ----------------------- |
| **qcpu**         | yes            | > 0                  | 756 nodes                                                     | 128       | 0        | no            | 24 / 48h                |
| **qcpu_biz**     | yes            | > 0                  | 756 nodes                                                     | 128       | 0       | no            | 24 / 48h                |
| **qcpu_eurohpc** | yes            | > 0                  | 756 nodes                                                     | 128       | 0       | no            | 24 / 48h                |
| **qcpu_exp**     | yes            | none required        | 756 nodes<br>max 2 nodes per user                             | 128       | 150      | no            | 1 / 1h                  |
| **qcpu_free**    | yes            | < 150% of allocation | 756 nodes<br>max 4 nodes per job                              | 128       | -100    | no            | 12 / 18h                |
| **qcpu_long**    | yes            | > 0                  | 200 nodes<br>max 20 nodes per job, only non-accelerated nodes allowed | 128 | 0        | no            | 72 / 144h               |
| **qcpu_preempt** | yes            | > 0                  | 756 nodes<br>max 4 nodes per job                              | 128       | -200     | no            | 12 / 12h                |
| **qgpu**         | yes            | > 0                  | 72 nodes                                                      | 16 cpus<br>1 gpu | 0 | yes           | 24 / 48h                |
| **qgpu_biz**     | yes            | > 0                  | 70 nodes                                                      | 128       | 0       | yes           | 24 / 48h                |
| **qgpu_eurohpc** | yes            | > 0                  | 70 nodes                                                      | 128       | 0       | yes           | 24 / 48h                |
| **qgpu_exp**     | yes            | none required        | 4 nodes<br>max 1 node per job                                 | 16 cpus<br>1 gpu| 150| no            | 1 / 1h                  |
| **qgpu_free**    | yes            | < 150% of allocation | 46 nodes<br>max 2 nodes per job                               | 16 cpus<br>1 gpu|-100| no            | 12 / 18h                |
| **qgpu_preempt** | yes            | > 0                  | 72 nodes<br>max 2 nodes per job                               | 16 cpus<br>1 gpu|-200| no            | 12 / 12h                |
| **qviz**         | yes            | none required        | 2 nodes (with NVIDIA® Quadro RTX™ 6000)                       | 8         | 0        | no            | 1 / 8h                  |
| **qfat**         | yes            | > 0                  | 1 (sdf1)                                                      | 24        | 0        | yes           | 24 / 48h                |

## Legacy Queues

| Queue            | Active project | Project resources    | Nodes                                                         | Min ncpus | Priority | Authorization | Walltime (default/max)  |
| ---------------- | -------------- | -------------------- | ------------------------------------------------------------- | --------- | -------- | ------------- | ----------------------- |
| **qfree**        | yes            | < 150% of allocation | 756 nodes<br>max 4 nodes per job                              | 128       | -100    | no            | 12 / 12h                |
| **qexp**         | no             | none required        | 756 nodes<br>max 2 nodes per job                             | 128       | 150      | no            | 1 / 1h                  |
| **qprod**        | yes            | > 0                  | 756 nodes                                                     | 128       | 0        | no            | 24 / 48h                |
| **qlong**        | yes            | > 0                  | 200 nodes<br>max 20 nodes per job, only non-accelerated nodes allowed | 128 | 0        | no            | 72 / 144h               |
| **qnvidia**      | yes            | > 0                  | 72 nodes                                                      | 128       | 0        | yes           | 24 / 48h                |
+190 −0
Original line number Original line Diff line number Diff line
# Karolina - Job Submission and Execution

## Introduction

[Slurm][1] workload manager is used to allocate and access Karolina cluster's resources.
This page describes Karolina cluster's specific Slurm settings and usage.
General information about Slurm usage at IT4Innovations can be found at [Slurm Job Submission and Execution][2].

## Partition Information

Partitions/queues on the system:

```console
$ sinfo -s
PARTITION    AVAIL  TIMELIMIT   NODES(A/I/O/T) NODELIST
qcpu*           up 2-00:00:00      1/717/0/718 cn[001-718]
qcpu_biz        up 2-00:00:00      1/717/0/718 cn[001-718]
qcpu_exp        up    1:00:00      1/719/0/720 cn[001-720]
qcpu_free       up   18:00:00      1/717/0/718 cn[001-718]
qcpu_long       up 6-00:00:00      1/717/0/718 cn[001-718]
qcpu_preempt    up   12:00:00      1/717/0/718 cn[001-718]
qgpu            up 2-00:00:00        0/70/0/70 acn[01-70]
qgpu_big        up   12:00:00        71/1/0/72 acn[01-72]
qgpu_biz        up 2-00:00:00        0/70/0/70 acn[01-70]
qgpu_exp        up    1:00:00        0/72/0/72 acn[01-72]
qgpu_free       up   18:00:00        0/70/0/70 acn[01-70]
qgpu_preempt    up   12:00:00        0/70/0/70 acn[01-70]
qfat            up 2-00:00:00          0/1/0/1 sdf1
qviz            up    8:00:00          0/2/0/2 viz[1-2]
```

For more information about Karolina's queues, see [this page][8].

A graphical representation of cluster usage, partitions, nodes, and jobs can be found
at [https://extranet.it4i.cz/rsweb/karolina][3].

On the Karolina cluster:

* all CPU queues/partitions provide full node allocation; whole nodes (all node resources) are allocated to a job.
* other queues/partitions (gpu, fat, viz) provide partial node allocation; the job's resources (CPU, memory) are separated and dedicated to the job.

!!! important "Partial node allocation and security"
    Division of nodes means that if two users allocate a portion of the same node, they can see each other's running processes.
    If this solution is inconvenient for you, consider allocating a whole node.
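
A whole node on a partially allocated partition can be requested with Slurm's `--exclusive` option, for example as a jobscript directive (a minimal sketch):

```console
#SBATCH --exclusive
```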


IT4I clusters are monitored for resource utilization.
One of the monitoring daemons uses registers to collect performance
monitoring counters (PMC), which the user may need when analysing the performance
of the executed application (perf or [Score-P][10] profiling tools).
To deactivate the daemon and release the respective registers, set the job feature
during allocation, as specified [here][9].

## Using CPU Queues

Access [standard compute nodes][4].
Whole nodes are allocated. Use the `--nodes` option to specify the number of requested nodes.
There is no need to specify the number of cores and memory size.

```console
#!/usr/bin/bash
#SBATCH --job-name MyJobName
#SBATCH --account PROJECT-ID
#SBATCH --partition qcpu
#SBATCH --time 12:00:00
#SBATCH --nodes 8
...
```
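
The script is then submitted with `sbatch` (the file name is illustrative):

```console
$ sbatch myjob.sh
```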

## Using GPU Queues

Access [GPU accelerated nodes][5].
Every GPU accelerated node is divided into eight parts, each part contains one GPU, 16 CPU cores and corresponding memory.
By default, only one part, i.e. 1/8 of the node - one GPU and corresponding CPU cores and memory, is allocated.
There is no need to specify the number of cores or the memory size; on the contrary, doing so is undesirable.
Some restrictions are in place to ensure a fair division and efficient use of node resources.

```console
#!/usr/bin/bash
#SBATCH --job-name MyJobName
#SBATCH --account PROJECT-ID
#SBATCH --partition qgpu
#SBATCH --time 12:00:00
...
```

To allocate more GPUs, use the `--gpus` option.
The default behavior is to allocate enough nodes to satisfy the resources requested by the `--gpus` option, without delaying the start of the job.

The following code requests one GPU. One GPU and 16 CPU cores will be allocated to the job. Up to eight such jobs can run on a single GPU node.

```console
#SBATCH --gpus 1
```

The following code requests four GPUs; the scheduler can allocate from one up to four nodes, depending on the actual cluster state (i.e. GPU availability), to fulfil the request.

```console
#SBATCH --gpus 4
```

The following code requests 16 GPUs; the scheduler can allocate from two up to sixteen nodes, depending on the actual cluster state (i.e. GPU availability), to fulfil the request.

```console
#SBATCH --gpus 16
```

To allocate GPUs within one node, you have to specify the `--nodes` option.

The following code requests four GPUs on exactly one node.

```console
#SBATCH --gpus 4
#SBATCH --nodes 1
```

The following code requests 16 GPUs on exactly two nodes.

```console
#SBATCH --gpus 16
#SBATCH --nodes 2
```

Alternatively, you can use the `--gpus-per-node` option.
Only the value 8 is allowed for multi-node allocations, to prevent fragmenting nodes.

The following code requests 16 GPUs on exactly two nodes.

```console
#SBATCH --gpus-per-node 8
#SBATCH --nodes 2
```

For large jobs that require more than 16 GPU nodes (i.e. at least 128 GPUs), the `qgpu_big` queue is designated; it allows up to 64 GPU nodes (up to 512 GPUs) per job.
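
Such a job might be requested, for instance, like this (an illustrative sketch; 32 nodes × 8 GPUs = 256 GPUs):

```console
#SBATCH --partition qgpu_big
#SBATCH --nodes 32
#SBATCH --gpus-per-node 8
```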


## Using Fat Queue

Access the [data analytics (fat) node][6].
The fat node is divided into 32 parts; each part contains one socket/processor (24 cores) and the corresponding memory.
By default, only one part, i.e. 1/32 of the node (one processor and the corresponding memory), is allocated.

To allocate the requested memory, use the `--mem` option.
The corresponding CPUs will be allocated.
The fat node has about 22.5 TB of memory available for jobs.

```console
#!/usr/bin/bash
#SBATCH --job-name MyJobName
#SBATCH --account PROJECT-ID
#SBATCH --partition qfat
#SBATCH --time 2:00:00
#SBATCH --mem 6TB
...
```

You can also specify CPU-oriented options (like `--cpus-per-task`); the appropriate amount of memory will then be allocated to the job.
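
For example (an illustrative sketch; 48 cores correspond to two of the 32 parts, and the matching share of memory is allocated):

```console
#SBATCH --partition qfat
#SBATCH --cpus-per-task 48
```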

To allocate a whole fat node, use the `--exclusive` option:

```console
#SBATCH --exclusive
```

## Using Viz Queue

Access [visualization nodes][7].
Every visualization node is divided into eight parts.
By default, only one part, i.e. 1/8 of the node, is allocated.

```console
$ salloc -A PROJECT-ID -p qviz
```

To allocate a whole visualization node, use the `--exclusive` option:

```console
$ salloc -A PROJECT-ID -p qviz --exclusive
```

[1]: https://slurm.schedmd.com/
[2]: /general/slurm-job-submission-and-execution
[3]: https://extranet.it4i.cz/rsweb/karolina
[4]: /karolina/compute-nodes/#compute-nodes-without-accelerators
[5]: /karolina/compute-nodes/#compute-nodes-with-a-gpu-accelerator
[6]: /karolina/compute-nodes/#data-analytics-compute-node
[7]: /karolina/visualization/
[8]: ./karolina-partitions.md
[9]: /job-features/#cluster-monitoring
[10]: /software/debuggers/score-p/
 No newline at end of file
Original line number Original line Diff line number Diff line
@@ -5,7 +5,16 @@


If you are not eligible for an e-INFRA CZ account, contact the [IT4I support][a] (email: [support\[at\]it4i.cz][b]) and provide the following information:
If you are not eligible for an e-INFRA CZ account, contact the [IT4I support][a] (email: [support\[at\]it4i.cz][b]) and provide the following information:


1. Full name, country/countries of citizenship, academic affiliation, and country of affiliation
1. Personal information (**required**, note that without this information, you cannot use IT4I resources):
    1. **Full name**
    1. **Gender**
    1. **Citizenship**
    1. **Country of residence**
    1. **Organization/affiliation**
    1. **Organization/affiliation country**
    1. **Organization/affiliation type** (university, company, R&D institution, private/public sector (hospital, police), academy of sciences, etc.)
    1. **Job title** (student, PhD student, researcher, research assistant, employee, etc.)
1. Project name and/or primary investigator's (PI) name. Project name consists of project type (OPEN|DD|EU|ATR|FTA|ICA) and number in -XX-XX format, for example OPEN-33-12.
1. Statement that you have read and accepted the [Acceptable use policy document][c] (AUP)
1. Statement that you have read and accepted the [Acceptable use policy document][c] (AUP)
1. Attach the AUP file
1. Attach the AUP file
1. Your preferred username (length is limited between 4 and 7 letters)<br>The preferred username must associate with your first and last name or be otherwise derived from it. Note that the system will automatically add the `it4i-` prefix to your username.
1. Your preferred username (length is limited between 4 and 7 letters)<br>The preferred username must associate with your first and last name or be otherwise derived from it. Note that the system will automatically add the `it4i-` prefix to your username.
@@ -19,9 +28,9 @@ Subject: Access to IT4Innovations


Dear support,
Dear support,


Please open the user account for me and attach the account to OPEN-0-0
Please open the user account for me and attach the account to PROJECTNAME-XX-XX.
Personal information: John Smith, USA, Department of Chemistry, MIT, MA, US
Personal information: John Smith, USA, Department of Chemistry, MIT, MA, US.
I have read and accept the Acceptable use policy document (attached)
I have read and accept the Acceptable use policy document (attached).


Preferred username: johnsm
Preferred username: johnsm


@@ -64,7 +73,7 @@ e.g. providing sensitive information such as ID scan or user login/password.
The following example is for Actalis free S/MIME certificate, but you can choose your preferred CA.
The following example is for Actalis free S/MIME certificate, but you can choose your preferred CA.


1. Go to the [Actalis Free Email Certificate][l] request form.
1. Go to the [Actalis Free Email Certificate][l] request form.
1. Follow the instructions: fill out the form, accept the terms and conditions, and submit the request.
1. Select the free version - Mailbox Validated - and remove the €6.00 renewal item from your cart before proceeding with the order.
1. You will receive an email with the certificate.
1. You will receive an email with the certificate.
1. Import the certificate to one of the supported email clients.
1. Import the certificate to one of the supported email clients.
1. Attach a scan of photo ID (personal ID, passport, or driver license) to your email request for IT4I account.
1. Attach a scan of photo ID (personal ID, passport, or driver license) to your email request for IT4I account.
@@ -84,7 +93,7 @@ The following example is for Actalis free S/MIME certificate, but you can choose


[a]: https://support.it4i.cz/rt/
[a]: https://support.it4i.cz/rt/
[b]: mailto:support@it4i.cz
[b]: mailto:support@it4i.cz
[c]: https://www.it4i.cz/file/281883408ded04bd0961113ea33b8118/7450/AUP2022-v4-CZE-ENG.final.signed[94].pdf
[c]: https://docs.it4i.cz/general/aup/
[d]: http://support.it4i.cz/
[d]: http://support.it4i.cz/
[e]: https://scs.it4i.cz
[e]: https://scs.it4i.cz
[f]: http://www.igtf.net/
[f]: http://www.igtf.net/
@@ -93,7 +102,7 @@ The following example is for Actalis free S/MIME certificate, but you can choose
[i]: http://www.postsignum.cz/
[i]: http://www.postsignum.cz/
[j]: http://www.ica.cz/Kvalifikovany-certifikat.aspx
[j]: http://www.ica.cz/Kvalifikovany-certifikat.aspx
[k]: http://idoc.vsb.cz/xwiki/wiki/infra/view/uzivatel/moz-cert-gen
[k]: http://idoc.vsb.cz/xwiki/wiki/infra/view/uzivatel/moz-cert-gen
[l]: https://extrassl.actalis.it/portal/uapub/freemail?lang=en
[l]: https://www.actalis.com/s-mime-certificates
[r]: https://www.it4i.cz/computing-resources-allocation/?lang=en
[r]: https://www.it4i.cz/computing-resources-allocation/?lang=en
[s]: https://extranet.it4i.cz/ssp/?action=changesshkey
[s]: https://extranet.it4i.cz/ssp/?action=changesshkey
[u]: https://www.eduid.cz/
[u]: https://www.eduid.cz/
Original line number Original line Diff line number Diff line
!!!warning
    This page has not been updated yet. The page does not reflect the transition from PBS to Slurm.

# Job Submission and Execution

## Job Submission

When allocating computational resources for the job, specify:

1. a suitable queue for your job (the default is qprod)
1. the number of computational nodes (required)
1. the number of cores per node (not required)
1. the maximum wall time allocated to your calculation, note that jobs exceeding the maximum wall time will be killed
1. your Project ID
1. a Jobscript or interactive switch

Submit the job using the `qsub` command:

```console
$ qsub -A Project_ID -q queue -l select=x:ncpus=y,walltime=[[hh:]mm:]ss[.ms] jobscript
```

The `qsub` command submits the job to the queue, i.e. it creates a request to the PBS Job manager for allocation of specified resources. The resources will be allocated when available, subject to the above described policies and constraints. **After the resources are allocated, the jobscript or interactive shell is executed on the first of the allocated nodes.**

!!! note
    `ncpus=y` is usually not required, because the smallest allocation unit is an entire node. The exception are corner cases for `qviz` and `qfat` on Karolina.

### Job Submission Examples

```console
$ qsub -A OPEN-0-0 -q qprod -l select=64,walltime=03:00:00 ./myjob
```

In this example, we allocate 64 nodes, 36 cores per node, for 3 hours. We allocate these resources via the `qprod` queue, consumed resources will be accounted to the project identified by Project ID `OPEN-0-0`. The jobscript `myjob` will be executed on the first node in the allocation.

```console
$ qsub -q qexp -l select=4 -I
```

In this example, we allocate 4 nodes, 36 cores per node, for 1 hour. We allocate these resources via the `qexp` queue. The resources will be available interactively.

```console
$ qsub -A OPEN-0-0 -q qnvidia -l select=10 ./myjob
```

In this example, we allocate 10 NVIDIA accelerated nodes, 24 cores per node, for 24 hours. We allocate these resources via the `qnvidia` queue. The jobscript `myjob` will be executed on the first node in the allocation.

```console
$ qsub -A OPEN-0-0 -q qfree -l select=10 ./myjob
```

In this example, we allocate 10 nodes, 24 cores per node, for 12 hours. We allocate these resources via the `qfree` queue. It is not required that the project `OPEN-0-0` has any available resources left. Consumed resources are still accounted for. The jobscript `myjob` will be executed on the first node in the allocation.

All `qsub` options may be [saved directly into the jobscript][1]. In such cases, it is not necessary to specify any options for `qsub`.

```console
$ qsub ./myjob
```

By default, the PBS batch system sends an email only when the job is aborted. Disabling mail events completely can be done as follows:

```console
$ qsub -m n
```

#### Dependency Job Submission

To submit dependent jobs in sequence, use the `depend` function of `qsub`.

First submit the first job in a standard manner:

```console
$ qsub -A OPEN-0-0 -q qprod -l select=64,walltime=02:00:00 ./firstjob
123456[].isrv1
```

Then submit the second job using the `depend` function:

```console
$ qsub -W depend=afterok:123456 ./secondjob
```

Both jobs will be queued, but the second job won't start until the first job has finished successfully.

Below is the list of arguments that can be used with `-W depend=dependency:jobid`:

| Argument    | Description                                                     |
| ----------- | --------------------------------------------------------------- |
| after       | This job is scheduled after `jobid` begins execution.       |
| afterok     | This job is scheduled after `jobid` finishes successfully.  |
| afternotok  | This job is scheduled after `jobid` finishes unsuccessfully. |
| afterany    | This job is scheduled after `jobid` finishes in any state.  |
| before      | This job must begin execution before `jobid` is scheduled.  |
| beforeok    | This job must finish successfully before `jobid` begins.        |
| beforenotok | This job must finish unsuccessfully before `jobid` begins.      |
| beforeany   | This job must finish in any state before `jobid` begins.        |

### Useful Tricks

All `qsub` options may be [saved directly into the jobscript][1]. In such a case, no options to `qsub` are needed.

```console
$ qsub ./myjob
```

By default, the PBS batch system sends an email only when the job is aborted. Disabling mail events completely can be done like this:

```console
$ qsub -m n
```

<!--- NOT IMPLEMENTED ON KAROLINA YET

## Advanced Job Placement

### Salomon - Placement by Network Location

The network location of allocated nodes in the [InfiniBand network][3] influences efficiency of network communication between nodes of job. Nodes on the same InfiniBand switch communicate faster with lower latency than distant nodes. To improve communication efficiency of jobs, PBS scheduler on Salomon is configured to allocate nodes (from currently available resources), which are as close as possible in the network topology.

For communication intensive jobs, it is possible to set stricter requirement - to require nodes directly connected to the same InfiniBand switch or to require nodes located in the same dimension group of the InfiniBand network.

### Salomon - Placement by InfiniBand Switch

Nodes directly connected to the same InfiniBand switch can communicate most efficiently. Using the same switch prevents hops in the network and provides for unbiased, most efficient network communication. There are 9 nodes directly connected to every InfiniBand switch.

!!! note
    We recommend allocating compute nodes of a single switch when the best possible computational network performance is required to run job efficiently.

Nodes directly connected to the one InfiniBand switch can be allocated using node grouping on the PBS resource attribute `switch`.

In this example, we request all 9 nodes directly connected to the same switch using node grouping placement.

```console
$ qsub -A OPEN-0-0 -q qprod -l select=9 -l place=group=switch ./myjob
```

-->

## Advanced Job Handling

### Selecting Turbo Boost Off

!!! note
    For Barbora only.

Intel Turbo Boost Technology is on by default. We strongly recommend keeping the default.

If necessary (such as in the case of benchmarking), you can disable Turbo for all nodes of the job by using the PBS resource attribute `cpu_turbo_boost`:

```console
$ qsub -A OPEN-0-0 -q qprod -l select=4 -l cpu_turbo_boost=0 -I
```

More information about the Intel Turbo Boost can be found in the TurboBoost section.

### Advanced Examples

In the following example, we select an allocation for benchmarking a very special and demanding MPI program. We request Turbo off, and 2 full chassis of compute nodes (nodes sharing the same IB switches) for 30 minutes:

```console
$ qsub -A OPEN-0-0 -q qprod
    -l select=18:ibswitch=isw10:mpiprocs=1:ompthreads=16+18:ibswitch=isw20:mpiprocs=16:ompthreads=1
    -l cpu_turbo_boost=0,walltime=00:30:00
    -N Benchmark ./mybenchmark
```

The MPI processes will be distributed differently on the nodes connected to the two switches. On the isw10 nodes, we will run 1 MPI process per node with 16 threads per process, on isw20 nodes we will run 16 plain MPI processes.

Although this example is somewhat artificial, it demonstrates the flexibility of the qsub command options.

## Job Management

!!! note
    Check the status of your jobs using the `qstat` and `check-pbs-jobs` commands

```console
$ qstat -a
$ qstat -a -u username
$ qstat -an -u username
$ qstat -f 12345.srv11
```

Example:

```console
$ qstat -a

srv11:
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- --  |---|---| ------ --- --- ------ ----- - -----
16287.srv11 user1    qlong    job1         6183   4 64   --  144:0 R 38:25
16468.srv11 user1    qlong    job2         8060   4 64   --  144:0 R 17:44
16547.srv11 user2    qprod    job3x       13516   2 32   --  48:00 R 00:58
```

In this example user1 and user2 are running jobs named `job1`, `job2`, and `job3x`. `job1` and `job2` are using 4 nodes, 128 cores per node each. `job1` has already run for 38 hours and 25 minutes, and `job2` for 17 hours 44 minutes. So `job1`, for example, has already consumed `64 x 38.41 = 2,458.6` core-hours. `job3x` has already consumed `32 x 0.96 = 30.93` core-hours. These consumed core-hours will be [converted to node-hours][10] and accounted for on the respective project accounts, regardless of whether the allocated cores were actually used for computations.

The following commands allow you to check the status of your jobs using the `check-pbs-jobs` command, check for the presence of user's PBS jobs' processes on execution hosts, display load and processes, display job standard and error output, and continuously display (`tail -f`) job standard or error output.

```console
$ check-pbs-jobs --check-all
$ check-pbs-jobs --print-load --print-processes
$ check-pbs-jobs --print-job-out --print-job-err
$ check-pbs-jobs --jobid JOBID --check-all --print-all
$ check-pbs-jobs --jobid JOBID --tailf-job-out
```

Examples:

```console
$ check-pbs-jobs --check-all
JOB 35141.dm2, session_id 71995, user user2, nodes cn164,cn165
Check session id: OK
Check processes
cn164: OK
cn165: No process
```

In this example we see that job `35141.dm2` is not currently running any processes on the allocated node cn165, which may indicate an execution error:

```console
$ check-pbs-jobs --print-load --print-processes
JOB 35141.dm2, session_id 71995, user user2, nodes cn164,cn165
Print load
cn164: LOAD: 16.01, 16.01, 16.00
cn165: LOAD:  0.01,  0.00,  0.01
Print processes
       %CPU CMD
cn164:  0.0 -bash
cn164:  0.0 /bin/bash /var/spool/PBS/mom_priv/jobs/35141.dm2.SC
cn164: 99.7 run-task
...
```

In this example, we see that job `35141.dm2` is currently running a process run-task on node `cn164`, using one thread only, while node `cn165` is empty, which may indicate an execution error.

```console
$ check-pbs-jobs --jobid 35141.dm2 --print-job-out
JOB 35141.dm2, session_id 71995, user user2, nodes cn164,cn165
Print job standard output:
======================== Job start  ==========================
Started at    : Fri Aug 30 02:47:53 CEST 2013
Script name   : script
Run loop 1
Run loop 2
Run loop 3
```

In this example, we see the actual output (some iteration loops) of the job `35141.dm2`.

!!! note
    Manage your queued or running jobs, using the `qhold`, `qrls`, `qdel`, `qsig`, or `qalter` commands

You may release your allocation at any time, using the `qdel` command

```console
$ qdel 12345.srv11
```

You may kill a running job by force, using the `qsig` command

```console
$ qsig -s 9 12345.srv11
```

Learn more by reading the PBS man page

```console
$ man pbs_professional
```

## Job Execution

### Jobscript

!!! note
    Prepare the jobscript to run batch jobs in the PBS queue system

The Jobscript is a user made script controlling a sequence of commands for executing the calculation. It is often written in bash, though other scripts may be used as well. The jobscript is supplied to the PBS `qsub` command as an argument, and is executed by the PBS Professional workload manager.

!!! note
    The jobscript or interactive shell is executed on first of the allocated nodes.

```console
$ qsub -q qexp -l select=4 -N Name0 ./myjob
$ qstat -n -u username

srv11:
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
15209.srv11     username qexp     Name0        5530   4 128    --  01:00 R 00:00
   cn17/0*32+cn108/0*32+cn109/0*32+cn110/0*32
```

In this example, the nodes `cn17`, `cn108`, `cn109`, and `cn110` were allocated for 1 hour via the `qexp` queue. The `myjob` jobscript will be executed on the node `cn17`, while the nodes `cn108`, `cn109`, and `cn110` are available for use as well.

The jobscript or interactive shell is by default executed in the `/home` directory:

```console
$ qsub -q qexp -l select=4 -I
qsub: waiting for job 15210.srv11 to start
qsub: job 15210.srv11 ready

$ pwd
/home/username
```

In this example, 4 nodes were allocated interactively for 1 hour via the `qexp` queue. The interactive shell is executed in the `/home` directory.

!!! note
    All nodes within the allocation may be accessed via SSH. Unallocated nodes are not accessible to the user.

The allocated nodes are accessible via SSH from login nodes. The nodes may access each other via SSH as well.
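
For example (using the node names from the allocation above), you can log in to any allocated node directly:

```console
$ ssh cn108
```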

Calculations on allocated nodes may be executed remotely via MPI, SSH, pdsh, or clush. You may find out which nodes belong to the allocation by reading the `$PBS_NODEFILE` file:

```console
$ qsub -q qexp -l select=4 -I
qsub: waiting for job 15210.srv11 to start
qsub: job 15210.srv11 ready

$ pwd
/home/username

$ sort -u $PBS_NODEFILE
cn17.bullx
cn108.bullx
cn109.bullx
cn110.bullx

$ pdsh -w cn17,cn[108-110] hostname
cn17: cn17
cn108: cn108
cn109: cn109
cn110: cn110
```

In this example, the `hostname` program is executed via `pdsh` from the interactive shell. The execution runs on all four allocated nodes. The same result would be achieved if `pdsh` were called from any of the allocated nodes or from the login nodes.
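
The same remote execution can be sketched with `clush`, which is also mentioned above; the node names are those of the example allocation:

```console
$ clush -w cn17,cn[108-110] hostname
```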

### Example Jobscript for MPI Calculation

!!! note
    Production jobs must use the `/scratch` directory for I/O

The recommended way to run production jobs is to change to the `/scratch` directory early in the jobscript, copy all inputs to `/scratch`, execute the calculations, and copy outputs to the `/home` directory.

```bash
#!/bin/bash

cd $PBS_O_WORKDIR

SCRDIR=/scratch/project/open-00-00/${USER}/myjob
mkdir -p $SCRDIR

# change to scratch directory, exit on failure
cd $SCRDIR || exit

# copy input file to scratch
cp $PBS_O_WORKDIR/input .
cp $PBS_O_WORKDIR/mympiprog.x .

# load the MPI module
# (Always specify the module's name and version in your script;
# for the reason, see https://docs.it4i.cz/software/modules/lmod/#loading-modules.)
ml OpenMPI/4.1.1-GCC-10.2.0-Java-1.8.0_221

# execute the calculation
mpirun -pernode ./mympiprog.x

# copy output file to home
cp output $PBS_O_WORKDIR/.

#exit
exit
```

In this example, a directory in `/home` holds the input file `input` and the `mympiprog.x` executable. We create the `myjob` directory on the `/scratch` filesystem, copy the input and executable files from the `/home` directory where the `qsub` was invoked (`$PBS_O_WORKDIR`) to `/scratch`, execute the MPI program `mympiprog.x`, and copy the output file back to the `/home` directory. `mympiprog.x` is executed as one process per node, on all allocated nodes.
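
Assuming the jobscript above is saved as `myjob` in the submission directory, it could be submitted along these lines; the project ID, queue, and node count are placeholders:

```console
$ qsub -A OPEN-00-00 -q qprod -l select=4:ncpus=128 ./myjob
```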

!!! note
    Consider preloading inputs and executables onto the [shared scratch][6] storage before the calculation starts.

In some cases, it may be impractical to copy the inputs to the `/scratch` storage and the outputs to the `/home` directory. This is especially true when very large input and output files are expected, or when the files should be reused by a subsequent calculation. In such cases, it is the user's responsibility to preload the input files on the shared `/scratch` storage before the job submission, and to retrieve the outputs manually after all calculations are finished.

!!! note
    Store the `qsub` options within the jobscript. Use the `mpiprocs` and `ompthreads` qsub options to control the MPI job execution.

### Example Jobscript for MPI Calculation With Preloaded Inputs

Example jobscript for an MPI job with preloaded inputs and executables, options for `qsub` are stored within the script:

```bash
#!/bin/bash
#PBS -q qprod
#PBS -N MYJOB
#PBS -l select=100:mpiprocs=1:ompthreads=16
#PBS -A OPEN-00-00

# job is run using project resources; here ${PBS_ACCOUNT,,} translates to "open-00-00"
SCRDIR=/scratch/project/${PBS_ACCOUNT,,}/${USER}/myjob

# change to scratch directory, exit on failure
cd $SCRDIR || exit

# load the MPI module
# (Always specify the module's name and version in your script;
# for the reason, see https://docs.it4i.cz/software/modules/lmod/#loading-modules.)
ml OpenMPI/4.1.1-GCC-10.2.0-Java-1.8.0_221

# execute the calculation
mpirun ./mympiprog.x

#exit
exit
```

In this example, input and executable files are assumed to be preloaded manually in the `/scratch/project/open-00-00/$USER/myjob` directory. Because we used the `qprod` queue, we had to specify which project's resources we want to use, and our `PBS_ACCOUNT` variable will be set accordingly (OPEN-00-00). `${PBS_ACCOUNT,,}` uses bash's built-in parameter expansion to convert the value to lowercase.
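
The lowercasing itself is ordinary bash parameter expansion; this minimal sketch (with a made-up account value) shows the effect:

```bash
#!/bin/bash
# In a real job, PBS_ACCOUNT is set by PBS Professional; the value here is illustrative.
PBS_ACCOUNT="OPEN-00-00"
echo "${PBS_ACCOUNT,,}"   # prints: open-00-00
```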

Note the `mpiprocs` and `ompthreads` qsub options controlling the behavior of the MPI execution. `mympiprog.x` is executed as one process per node, on all 100 allocated nodes. If `mympiprog.x` implements OpenMP threads, it will run 16 threads per node.
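
For illustration, the same mechanism can request a pure-MPI layout on the same nodes by changing only the select line; the values are hypothetical:

```bash
#PBS -l select=100:mpiprocs=128:ompthreads=1
```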

### Example Jobscript for Single Node Calculation

!!! note
    The local scratch directory is often useful for single-node jobs. It is deleted immediately after the job ends.

Example jobscript for a single-node calculation, using the [local scratch][6] directory on the node:

```bash
#!/bin/bash

# change to local scratch directory
cd /lscratch/$PBS_JOBID || exit

# copy input file to scratch
cp $PBS_O_WORKDIR/input .
cp $PBS_O_WORKDIR/myprog.x .

# execute the calculation
./myprog.x

# copy output file to home
cp output $PBS_O_WORKDIR/.

#exit
exit
```

In this example, a directory in `/home` holds the input file `input` and the executable `myprog.x`. We copy the input and executable files from the `/home` directory where the `qsub` was invoked (`$PBS_O_WORKDIR`) to the local scratch directory `/lscratch/$PBS_JOBID`, execute `myprog.x`, and copy the output file back to the `/home` directory. `myprog.x` runs on one node only and may use threads.
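
For instance, such a single-node jobscript could be submitted via the `qexp` queue (the script name is the example's `myjob`):

```console
$ qsub -q qexp -l select=1 ./myjob
```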

### Other Jobscript Examples

Further jobscript examples may be found in the software section and the [Capacity computing][9] section.

[1]: #example-jobscript-for-mpi-calculation-with-preloaded-inputs
[2]: resources-allocation-policy.md
[3]: ../salomon/network.md
[5]: ../salomon/7d-enhanced-hypercube.md
[6]: ../salomon/storage.md
[9]: capacity-computing.md
[10]: resources-allocation-policy.md#resource-accounting-policy
# Resource Accounting Policy

Starting with the 24<sup>th</sup> open access grant competition,
the accounting policy has been changed from [normalized core hours (NCH)][2a] to **node-hours (NH)**.
This means that it is now required to apply for node hours of the specific cluster and node type:

1. [Barbora CPU][3a]
1. [Barbora GPU][4a]
1. [Karolina GPU][8a]
1. [Karolina FAT][9a]

The accounting runs whenever the nodes are allocated via the Slurm workload manager (the `sbatch` or `salloc` command),
regardless of whether the nodes are actually used for any calculation.
The same rule applies for unspent [reservations][10a].


## Resource Accounting Formula

| Resources                       | NH Consumed                  |
| ------------------------------- | ---------------------------- |
| Barbora All types, Karolina CPU | allocated nodes \* time      |
| Karolina GPU                    | allocated gpus \* time / 8   |
| Karolina FAT                    | allocated cpus \* time / 768 |
| Karolina VIZ                    | allocated cpus \* time / 64  |

time: duration of the Slurm job in hours
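
As an illustrative calculation (the job sizes here are made up, not part of the policy text): a job occupying 4 Karolina GPUs for 10 hours is accounted as `4 * 10 / 8 = 5` NH, while a 2-node Barbora CPU job running for 10 hours consumes `2 * 10 = 20` NH.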


!!! important "CPU/GPU resources granularity"
    Minimal granularity of all Barbora's partitions and Karolina's CPU partition is 1 node.
    This means that if you request, for example, 32 cores on Karolina's CPU partition,
    your job will still consume 1 NH \* time.

    All other Karolina's partitions (GPU, FAT, VIZ) provide partial node allocation;
    i.e.: if you request 4 GPUs on Karolina, you will consume only 0.5 NH \* time.

## Conversion Table

| Resources    | Conversion for 1 node-hour |
| ------------ | -------------------------- |
| Barbora CPU  | 36 core-hours              |
| Barbora GPU  | 4 GPU hours                |
| Barbora FAT  | 128 core-hours             |
| DGX-2        | 16 GPU hours               |
| Karolina CPU | 128 core-hours             |
| Karolina GPU | 8 GPU hours                |
| Karolina FAT | 768 core-hours             |

## Original Resource Accounting Policy

The original policy, as stated below, is still applied to projects from previous grant competitions.

### Wall-Clock Core-Hours WCH

The wall-clock core-hours (WCH) are the basic metric of computer utilization time.
1 wall-clock core-hour is defined as 1 processor core allocated for 1 hour of wall-clock time. For example, allocating a full node (i.e. 36 cores) on Barbora for 1 hour amounts to 36 wall-clock core-hours.

### Normalized Core-Hours NCH

The resources subject to accounting are the normalized core-hours (NCH).
The normalized core-hours are obtained from WCH by applying a normalization factor:

$$
NCH = F*WCH
$$

All jobs are accounted in normalized core-hours, using factor F valid at the time of the execution:

| System        | F    |
| --------------| ---: |
| Karolina      | 1.00 |
| Barbora CPU   | 1.40 |
| Barbora GPU   | 4.50 |
| DGX-2         | 5.50 |

Factors are valid as of July 9, 2022.
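
As a worked illustration of the formula above, a job that accumulates 1000 wall-clock core-hours on Barbora CPU is accounted as:

$$
NCH = 1.40 \times 1000 = 1400
$$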

The normalized core-hours were introduced to treat systems of different age on equal footing.
Normalized core-hour is an accounting tool to discount the legacy systems.

See examples in the [Job submission and execution][1a] section.

### Consumed Resources

Check how many core-hours have been consumed. The command `it4ifree` is available on cluster login nodes.

```console
$ it4ifree

Projects I am participating in
==============================
PID         Days left      Total    Used WCHs    Used NCHs    WCHs by me    NCHs by me     Free
----------  -----------  -------  -----------  -----------  ------------  ------------  -------
OPEN-XX-XX  323                0      5169947      5169947         50001         50001  1292555


Projects I am Primarily Investigating
=====================================
PID        Login         Used WCHs    Used NCHs
---------- ----------  -----------  -----------
OPEN-XX-XX user1            376670       376670
           user2           4793277      4793277

Legend
======
WCH   =    Wall-clock Core Hour
NCH   =    Normalized Core Hour
```

The `it4ifree` command is a part of the `it4i.portal.clients` package, located [here][pypi].


[1a]: job-submission-and-execution.md
[2a]: #normalized-core-hours-nch
[8a]: ../../karolina/compute-nodes/#compute-nodes-with-a-gpu-accelerator
[9a]: ../../karolina/compute-nodes/#data-analytics-compute-node
[10a]: resource_allocation_and_job_execution.md#resource-reservation

[pypi]: https://pypi.python.org/pypi/it4i.portal.clients
# How to Run Jobs

!!! important "Barbora migrating to Slurm"
    Starting July 19, 9 AM, we are migrating Barbora's workload manager **from PBS to Slurm**.
    For more information on how to submit jobs in Slurm, see the [Slurm Job Submission and Execution][8] section.

## Job Submission and Execution

To run a [job][1], computational resources for this particular job must be allocated. This is done via the [Slurm][a] job workload manager software, which distributes workloads across the supercomputer.

The `sbatch` or `salloc` command creates a request to the Slurm job manager for allocation of specified resources.
The resources will be allocated when available, subject to allocation policies and constraints.
**After the resources are allocated, the jobscript or interactive shell is executed on the first of the allocated nodes.**

Read more on the [Job Submission and Execution][5] page.
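
A minimal sketch of the two submission modes follows; the project ID, partition name, and node count are placeholders:

```console
$ sbatch -A OPEN-00-00 -p qcpu ./myjob.sh
$ salloc -A OPEN-00-00 -p qcpu -N 2
```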


## Resource Allocation Policy

Resources are allocated to the job in a fair-share fashion, subject to constraints set by the queue and resources available to the Project. [The Fair-share][3] ensures that individual users may consume approximately equal amounts of resources per week. The resources are accessible via queues for queueing the jobs. The queues provide prioritized and exclusive access to the computational resources.

!!! note
    See the queue status for [Karolina][d] or [Barbora][e].

Read more on the [Resource Allocation Policy][4] page.

## Resource Reservation

You can request a reservation of a specific number, range, or type of computational resources at [support@it4i.cz][c].
Note that unspent reserved node-hours count towards the total computational resources used.

## Capacity Computing

!!! note
    Use Job arrays when running a huge number of jobs.

Use GNU Parallel and/or Job arrays when running (many) single-core jobs.

In many cases, it is useful to submit a huge (100+) number of computational jobs into the PBS queue system. A huge number of (small) jobs is one of the most effective ways to execute parallel calculations, achieving the best runtime, throughput, and computer utilization. In this chapter, we discuss the recommended way to run huge numbers of jobs, including **ways to run huge numbers of single-core jobs**.

Read more on the [Capacity Computing][6] page.

## Vnode Allocation

The `qgpu` queue on Karolina takes advantage of the division of nodes into vnodes. An accelerated node equipped with two 64-core processors and eight GPU cards is treated as eight vnodes, each containing 16 CPU cores and 1 GPU card. Vnodes can be allocated to jobs individually: through precise definition of the resource list at job submission, you may allocate a varying number of resources/GPU cards according to your needs.

Read more on the [Vnode Allocation][7] page.


[1]: ../index.md#terminology-frequently-used-on-these-pages
[2]: https://slurm.schedmd.com/documentation.html
[3]: job-priority.md#fair-share-priority
[4]: resources-allocation-policy.md
[5]: job-submission-and-execution.md
[6]: capacity-computing.md
[7]: vnode-allocation.md
[8]: slurm-job-submission-and-execution.md

[a]: https://slurm.schedmd.com/
[b]: https://slurm.schedmd.com/documentation.html
[c]: mailto:support@it4i.cz
[d]: https://extranet.it4i.cz/rsweb/karolina/queues
[e]: https://extranet.it4i.cz/rsweb/barbora/queues
Computational resources are subject to [accounting policy][7].


!!! important
    Queues are divided based on a resource type: `qcpu_` for non-accelerated nodes and `qgpu_` for accelerated nodes. <br><br>
    EuroHPC queues are no longer available. If you are an EuroHPC user, use standard queues based on allocated/required type of resources.


### Queues


| <div style="width:86px">Queue</div>| Description |
| -------------------------------- | ----------- |
| `qcpu`                           | Production queue for non-accelerated nodes intended for standard production runs. Requires an active project with nonzero remaining resources. Full nodes are allocated. Identical to `qprod`. |
| `qgpu`                           | Dedicated queue for accessing the NVIDIA accelerated nodes. Requires an active project with nonzero remaining resources. It utilizes 8x NVIDIA A100 with 320GB HBM2 memory per node. The PI needs to explicitly ask support for authorization to enter the queue for all users associated with their project. **On Karolina, you can allocate 1/8 of the node - 1 GPU and 16 cores**. For more information, see [Karolina qgpu allocation][4]. |
| `qgpu_big`                       | Intended for big jobs (>16 nodes), queue priority is lower than production queue priority, **priority is temporarily increased every even weekend**. |
| `qcpu_biz`<br>`qgpu_biz`         | Commercial queues, slightly higher priority. |
| `qcpu_exp`<br>`qgpu_exp`         | Express queues for testing and running very small jobs. There are 2 nodes always reserved (w/o accelerators), max 8 nodes available per user. The nodes may be allocated on a per core basis. It is configured to run one job and accept five jobs in a queue per user. |
| `qcpu_free`<br>`qgpu_free`       | Intended for utilization of free resources, after a project has exhausted all its allocated resources. Note that the queue is **not free of charge**. [Normal accounting][2] applies. Consumed resources will be accounted to the Project. Access to the queue is removed if consumed resources exceed 150% of the allocation. Full nodes are allocated. |
| `qcpu_long`                      | Queues for long production runs. Require an active project with nonzero remaining resources. Only 200 nodes without acceleration may be accessed. Full nodes are allocated. |
| `qcpu_preempt`<br>`qgpu_preempt` | Free queues with the lowest priority (LP). The queues require a project with allocation of the respective resource type. There is no limit on resource overdraft. Jobs are killed if other jobs with a higher priority (HP) request the nodes and there are no other nodes available. LP jobs are automatically re-queued once HP jobs finish, so **make sure your jobs are re-runnable**. |
| `qdgx`                           | Queue for DGX-2, accessible from Barbora. |
| `qfat`                           | Queue for fat node, PI must request authorization to enter the queue for all users associated with their project. |
| `qviz`                           | Visualization queue intended for pre-/post-processing using OpenGL accelerated graphics. Each user gets 8 cores of a CPU allocated (approx. 64 GB of RAM and 1/8 of the GPU capacity (default "chunk")). If more GPU power or RAM is required, it is recommended to allocate more chunks (with 8 cores each) up to one whole node per user. This is currently also the maximum allowed allocation per one user. One hour of work is allocated by default; the user may ask for 2 hours maximum. |


### Legacy Queues

Legacy queues stay in production until early 2023.

| Legacy queue | Replaced by               |
| ------------ | ------------------------- |
| `qexp`       | `qcpu_exp` & `qgpu_exp`   |
| `qprod`      | `qcpu`                    |
| `qlong`      | `qcpu_long`               |
| `nvidia`     | `qgpu` Note that unlike in new queues, only full nodes can be allocated. |
| `qfree`      | `qcpu_free` & `qgpu_free` |

See the following subsections for the list of queues:

* [Karolina queues][5]


## Queue Notes

The job time limit defaults to **half the maximum time**, see the table above.
Longer time limits can be [set manually, see examples][3].

Jobs that exceed the reserved time limit get killed automatically.
The time limit can be changed for queuing jobs (state Q) using the `scontrol update` command;
however, it cannot be changed for a running job.
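
For example (the job ID and new limit are illustrative):

```console
$ scontrol update JobId=123456 TimeLimit=24:00:00
```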


## Queue Status

!!! tip
    Check the status of jobs, queues and compute nodes [here][c].

![rsweb interface](../img/barbora_cluster_usage.png)


Display the queue status:

```console
$ sinfo -s
```


The Slurm allocation overview may also be obtained using the `rsslurm` command:


```console
$ rsslurm
Usage: rsslurm [options]

Options:
  --version             show program's version number and exit
  ...
```

[1]: job-priority.md
[2]: #resource-accounting-policy
[3]: job-submission-and-execution.md
[4]: karolina-slurm.md
[5]: ./karolina-partitions.md
[6]: ./barbora-partitions.md
[7]: ./resource-accounting.md

[a]: https://support.it4i.cz/rt/
---
hide:

- toc

---

# Slurm Batch Jobs Examples

Below is an excerpt from the [2024 e-INFRA CZ conference][1]
describing best practices for Slurm batch calculations and data management, including examples, by Ondrej Meca.

![PDF presentation on Slurm Batch Jobs Examples](../src/srun_karolina.pdf){ type=application/pdf style="min-height:100vh;width:100%" }

[1]: https://www.e-infra.cz/en/e-infra-cz-conference

Additional files in this comparison have their previews collapsed due to the size limit:

* `docs.it4i/pbspro.md`: deleted (+0 −11)
* `scripts/maketitle.py`: added (+81 −0)
* `scripts/meta-json.sh`: added (+11 −0)
* `scripts/movefiles.sh`: added (+12 −0)
* `scripts/movepublic.sh`: added (+10 −0)
* `scripts/url_test.py`: added (+37 −0)
* several further changed or added files whose names are not shown in the collapsed preview (+23 −0, +83 −24, +35 −47)