Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (908): 1244 additions and 132 deletions.
......@@ -12,7 +12,7 @@ docs:
image: it4innovations/docker-mdcheck:latest
allow_failure: true
script:
- mdl -r ~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046 *.md docs.it4i # BUGS
- find content/docs -name "*.mdx" | xargs mdl -r ~MD002,~MD007,~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046
pylint:
stage: test
......@@ -22,20 +22,16 @@ pylint:
script:
- pylint $(find . -name "*.py" -not -name "feslicescript.py")
pysafety:
capitalize:
stage: test
image: it4innovations/docker-pycheck:latest
image: it4innovations/docker-mkdocscheck:latest
allow_failure: true
before_script:
- source /opt/.venv3/bin/activate
- python -V # debug
- pip list | grep titlecase
script:
- cat requirements.txt | safety check --stdin --full-report
capitalize:
stage: test
image: it4innovations/docker-mkdocscheck:latest
script:
- find mkdocs.yml docs.it4i/ \( -name '*.md' -o -name '*.yml' \) -print0 | xargs -0 -n1 scripts/titlemd.py --test
- find content/docs/ \( -name '*.mdx' -o -name '*.yml' \) ! -path '*einfracz*' -print0 | xargs -0 -n1 scripts/titlemd.py --test
ext_links:
stage: after_test
......@@ -45,7 +41,7 @@ ext_links:
# remove JSON results
- rm *.json
script:
- find docs.it4i/ -name '*.md' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
- find content/docs -name '*.mdx' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
only:
- master
......@@ -55,8 +51,8 @@ ext_links:
before_script:
- echo "192.168.101.10 docs.it4i.cz" >> /etc/hosts
- wget -V
- echo https://docs.it4i.cz/devel/$CI_BUILD_REF_NAME/
- wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_BUILD_REF_NAME/ || true
- echo https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
- wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/ || true
script:
- cat wget.log | awk '/^Found [0-9]+ broken link[s]?.$/,/FINISHED/ { rc=-1; print $0 }; END { exit rc }'
......@@ -66,6 +62,7 @@ mkdocs:
before_script:
- source /opt/.venv3/bin/activate
- python -V # debug
- pip install -r requirements.txt
- pip freeze # debug
- mkdocs -V # debug
script:
......@@ -74,9 +71,9 @@ mkdocs:
# get modules list from clusters
- bash scripts/get_modules.sh
# generate site_url
- (if [ "${CI_BUILD_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_BUILD_REF_NAME\//" mkdocs.yml;fi);
- (if [ "${CI_COMMIT_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_COMMIT_REF_NAME\//" mkdocs.yml;fi);
# generate URL for code link
- sed -i "s/master/$CI_BUILD_REF_NAME/g" material/partials/toc.html
# - sed -i "s/master/$CI_BUILD_REF_NAME/g" material/partials/toc.html
# regenerate modules matrix
- python scripts/modules_matrix.py > docs.it4i/modules-matrix.md
- python scripts/modules_matrix.py --json > docs.it4i/modules-matrix.json
......@@ -112,7 +109,7 @@ deploy to stage:
- echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
script:
- chown nginx:nginx site -R
- rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_BUILD_REF_NAME/
- rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
only:
- branches@sccs/docs.it4i.cz
......
Quantum Scalar I6
JAN
LUMI
AI
CI/CD
AWS
CLI
FAQ
s3cmd
GUI
EESSI
hipBlas
hipSolver
LUMI
apptainer
ROCm
HIP
NVIDIA DGX-2
nvidia
smi
......@@ -816,3 +833,19 @@ e-INFRA CZ
DICE
qgpu
qcpu
it4i-portal-clients
it4icheckaccess
it4idedicatedtime
it4ifree
it4ifsusage
it4iuserfsusage
it4iprojectfsusage
it4imotd
e-INFRA
it4i-portal-clients
s3cmd
s5cmd
title:
e-INFRA CZ Cloud Ostrava
e-INFRA CZ Account
# User documentation
test
# IT4Innovations Documentation
This project contains IT4Innovations user documentation source.
## Development
### Install
```console
$ sudo apt install libpython-dev
$ virtualenv venv
$ source venv/bin/activate
$ pip install -r requirements.txt
```
### Package upgrade with pip
```console
$ pip list -o
$ pip install --upgrade package
$ pip freeze | sed '/pkg-resources==/d' > requirements.txt
```
## Environments
* [https://docs.it4i.cz](https://docs.it4i.cz) - master branch
* [https://docs.it4i.cz/devel/$BRANCH_NAME](https://docs.it4i.cz/devel/$BRANCH_NAME) - maps the branches, available only with VPN access
## URLs
* [http://facelessuser.github.io/pymdown-extensions/](http://facelessuser.github.io/pymdown-extensions/)
* [http://squidfunk.github.io/mkdocs-material/](http://squidfunk.github.io/mkdocs-material/)
```
fair-share
InfiniBand
RedHat
CentOS
Mellanox
```
## Mathematical Formulae
### Formulas are made with:
* [https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/](https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/)
* [https://www.mathjax.org/](https://www.mathjax.org/)
You can add a formula to a page like this:
```
$$
MAX\_FAIRSHARE * ( 1 - \frac{usage_{Project}}{usage_{Total}} )
$$
```
## Migration
To enable MathJax on a page, add the line ```---8<--- "mathjax.md"``` at the end of the file.
* [fumadocs](https://fumadocs.vercel.app/)
\ No newline at end of file
# SCS API v2
## Info
- **OpenAPI:** 3.1.0
- **Title:** scs-api-2
- **Version:** 0.1.0
- **Server URL:** `https://scs.it4i.cz/api/v2`
## Paths
### `/dedicated-time`
**GET**
- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries, optionally filtered by cluster name or period preset
- **OperationId:** `dedicated_time_handler`
**Parameters:**
- `cluster` (query): Filter by cluster name; Available values: karolina, barbora, dgx *(optional)*
- `period` (query): Filter by time period preset; Available values: planned, active *(optional)*
**Responses:**
- `200`: List of dedicated time entries
- `400`: Failed to deserialize query, Invalid cluster, Invalid period
Example:
```json
{
"message": "Invalid cluster: el_gordo"
}
```
- `500`: Failed to retrieve dedicated time due to a server error
Example:
```json
{
"message": "Failed to retreive dedicated time"
}
```
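For a quick check from the command line, this endpoint can be queried with a plain HTTP client; a minimal sketch using `curl`, assuming no additional authentication is required (the `cluster` and `period` values are the optional filters described above):
```console
$ curl -G "https://scs.it4i.cz/api/v2/dedicated-time" \
       --data-urlencode "cluster=karolina" \
       --data-urlencode "period=planned"
```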
### `/dedicated-time-calendar`
**GET**
- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries and generates a VCalendar response.
- **OperationId:** `dedicated_time_calendar`
**Responses:**
- `200`: Dedicated time VCalendar
Example:
```
BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//SUTD Timetable Calendar//randName//EN
CALSCALE:GREGORIAN
BEGIN:VEVENT
UID:1234@example.com
DTSTAMP:20230101T000000Z
DTSTART:20230101T000000Z
DTEND:20230102T000000Z
SUMMARY:Sample Dedicated Time - Cluster Outage
DESCRIPTION:Sample Dedicated Time - Cluster Outage
END:VEVENT
END:VCALENDAR
```
- `500`: Failed to retrieve dedicated time calendar
Example:
```json
{
"message": "Failed to retreive dedicated time calendar"
}
```
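The calendar can be saved to an `.ics` file and imported into a calendar client; a minimal `curl` sketch:
```console
$ curl -o dedicated-time.ics "https://scs.it4i.cz/api/v2/dedicated-time-calendar"
```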
### `/motd`
**GET**
- **Summary:** Get messages of the day
- **Description:** Retrieves messages of the day, optionally filtered by category
- **OperationId:** `motd`
**Parameters:**
- `category` (query): *(optional)*
**Responses:**
- `200`: List of motd entries
- `400`: Failed to deserialize query, Invalid motd category
- `500`: Failed to retrieve motd entries due to a server error
Example:
```json
{
"message": "Failed to retrieve motd"
}
```
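A minimal `curl` sketch for this endpoint; the `category` value below is only an illustration taken from the Motd schema example further down, since the spec does not list the allowed categories:
```console
$ curl -G "https://scs.it4i.cz/api/v2/motd" \
       --data-urlencode "category=public-service-announcement"
```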
## Components
### Schemas
#### DedicatedTime
```yaml
type: object
required:
- updated_at
properties:
cluster_type:
type: [string, 'null']
date_efficiency:
type: [string, 'null']
format: date-time
date_expiration:
type: [string, 'null']
format: date-time
updated_at:
type: string
format: date-time
```
#### Motd
```yaml
type: object
required:
- id
- author
- category
- created_at
- updated_at
- date_modification
- title
- message_body
- systems
properties:
id:
type: integer
format: int32
examples: [1]
author:
type: string
examples: [Admin]
category:
type: string
examples: [public-service-announcement]
created_at:
type: string
format: date-time
updated_at:
type: string
format: date-time
date_modification:
type: string
format: date-time
date_efficiency:
type: [string, 'null']
format: date-time
date_expiration:
type: [string, 'null']
format: date-time
date_outage_efficiency:
type: [string, 'null']
format: date-time
date_outage_expiration:
type: [string, 'null']
format: date-time
title:
type: string
examples: [Important Update]
message_body:
type: string
examples: [We are experiencing some service disruptions.]
systems:
type: array
items:
type: string
examples: [Karolina]
```
#### MsgResponse
```yaml
type: object
description: |
Common struct for DTO-less responses
eg. ```200 {"message":"Operation succeeded"}```
required:
- message
properties:
message:
type: string
examples: [API response]
```
# Introduction
This section contains documentation of decommissioned IT4Innovations' supercomputers.
This section contains documentation of decommissioned IT4Innovations' supercomputers and services.
## Salomon
......@@ -42,4 +42,9 @@ At the end of January 2021, after more than seven years, its operation permanent
| Corehours used | 134,130,309 |
| Power consumption | 77 kW |
## PRACE
Partnership for Advanced Computing in Europe aims to facilitate access to a research infrastructure that enables high-impact scientific discovery and engineering research and development across all disciplines to enhance European competitiveness for the benefit of society. For more information, see the [official website][b].
[a]: https://www.dolnivitkovice.cz/en/science-and-technology-centre/exhibitions/
[b]: https://prace-ri.eu/
# Hardware Overview
!!!important Work in progress
Barbora NG documentation is a WIP.
The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.
The launch of Barbora NG is planned for October/November.
In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.
Barbora NG consists of 141 non-accelerated compute nodes named **cn[?-???]**.
Each node is a powerful x86-64 computer equipped with 192 cores
(2x Intel Xeon 6952P with 96 CPU cores) and 768 GB RAM.
User access to the Barbora NG cluster is provided by two login nodes **login[1-2]**.
The nodes are interlinked through high speed InfiniBand NDR and Ethernet networks.
The parameters are summarized in the following tables:
| **In general** | |
| ------------------------------------ | --------------------- |
| Architecture of compute nodes | x86-64 |
| Operating system | Linux |
| [**Compute nodes**][1] | |
| Total | 141 |
| Processor Type | [Intel Xeon 6952P][b] |
| Architecture | Granite Rapids |
| Processor cores | 96 |
| Processors per node | 2 |
| RAM | 768 GB |
| Local disk drive | no |
| Compute network | InfiniBand NDR |
| non-accelerated | 141, cn[?-???] |
| **In total** | |
| Theoretical peak performance (Rpeak) | ??? TFLOP/s |
| Cores | 27072 |
| RAM | 108.288 TB |
[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md
[6]: visualization.md
[a]: https://support.it4i.cz/rt
[b]: https://www.intel.com/content/www/us/en/products/sku/241643/intel-xeon-6952p-processor-480m-cache-2-10-ghz/specifications.html
\ No newline at end of file
# Introduction
!!!important Work in progress
Barbora NG documentation is a WIP.
The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.
The launch of Barbora NG is planned for October/November.
In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.
Welcome to Barbora Next Gen (NG) supercomputer cluster.
Barbora NG is our latest supercomputer which consists of 141 compute nodes,
totaling 27072 compute cores with 108288 GB RAM, giving over ??? TFLOP/s theoretical peak performance.
Nodes are interconnected through a fully non-blocking fat-tree InfiniBand NDR network
and are equipped with Intel Granite Rapids processors.
Read more in [Hardware Overview][1].
The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains.
These packages are accessible via the [modules environment][2].
The user data shared file system and job data shared file system are available to users.
The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].
Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].
[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md
[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://slurm.schedmd.com/
# Compute Nodes
Barbora is a cluster of x86-64 Intel-based nodes built with the BullSequana Computing technology. The cluster contains three types of compute nodes.
Barbora is a cluster of x86-64 Intel-based nodes built with the BullSequana Computing technology.
The cluster contains three types of compute nodes.
## Compute Nodes Without Accelerators
* 192 nodes
* 6912 cores in total
* 2x Intel Cascade Lake 6240, 18-core, 2.6 GHz processors per node
* 192 GB DDR4 2933MT/s of physical memory per node (12x 16 GB)
* 192 GB DDR4 2933 MT/s of physical memory per node (12x16 GB)
* BullSequana X1120 blade servers
* 2995,2 GFLOP/s per compute node
* 2995.2 GFLOP/s per compute node
* 1x 1 GB Ethernet
* 1x HDR100 IB port
* 3 computes nodes per X1120 blade server
* 3 compute nodes per X1120 blade server
* cn[1-192]
![](img/BullSequanaX1120.png)
......@@ -22,11 +23,11 @@ Barbora is a cluster of x86-64 Intel-based nodes built with the BullSequana Comp
* 8 nodes
* 192 cores in total
* two Intel Skylake Gold 6126, 12-core, 2.6 GHz processors per node
* 192 GB DDR4 2933MT/s with ECC of physical memory per node (12x 16 GB)
* 192 GB DDR4 2933MT/s with ECC of physical memory per node (12x16 GB)
* 4x GPU accelerator NVIDIA Tesla V100-SXM2 per node
* Bullsequana X410-E5 NVLink-V blade servers
* 1996,8 GFLOP/s per compute nodes
* GPU-tp-GPU All-to-All NVLINK 2.0, GPU-Direct
* 1996.8 GFLOP/s per compute node
* GPU-to-GPU All-to-All NVLINK 2.0, GPU-Direct
* 1 GB Ethernet
* 2x HDR100 IB ports
* cn[193-200]
......@@ -37,8 +38,8 @@ Barbora is a cluster of x86-64 Intel-based nodes built with the BullSequana Comp
* 1x BullSequana X808 server
* 128 cores in total
* 8 Intel Skylake 8153, 16-core, 2.0 GHz, 125W
* 6144 GiB DDR4 2667MT/s of physical memory per node (92x 64 GB)
* 8 Intel Skylake 8153, 16-core, 2.0 GHz, 125 W
* 6144 GiB DDR4 2667 MT/s of physical memory per node (96x64 GB)
* 2x HDR100 IB port
* 8192 GFLOP/s
* cn[201]
......@@ -47,19 +48,21 @@ Barbora is a cluster of x86-64 Intel-based nodes built with the BullSequana Comp
## Compute Node Summary
| Node type | Count | Range | Memory | Cores | Queues |
| ---------------------------- | ----- | ----------- | ------ | ----------- | -------------------------- |
| Nodes without an accelerator | 189 | cn[1-189] | 192GB | 36 @ 2.6 GHz | qexp, qprod, qlong, qfree |
| Nodes with a GPU accelerator | 8 | cn[190-197] | 192GB | 24 @ 2.6 GHz | qnvidia |
| Fat compute nodes | 1 | cn[198] | 6144GiB | 128 @ 2.0 GHz | qfat |
| Node type | Count | Range | Memory | Cores |
| ---------------------------- | ----- | ----------- | -------- | ------------- |
| Nodes without an accelerator | 192 | cn[1-192] | 192 GB | 36 @ 2.6 GHz |
| Nodes with a GPU accelerator | 8 | cn[193-200] | 192 GB | 24 @ 2.6 GHz |
| Fat compute nodes | 1 | cn[201] | 6144 GiB | 128 @ 2.0 GHz |
## Processor Architecture
Barbora is equipped with Intel Cascade Lake processors Intel Xeon 6240 (nodes without accelerators), Intel Skylake Gold 6126 (nodes with accelerators) and Intel Skylake Platinum 8153.
Barbora is equipped with Intel Cascade Lake processors Intel Xeon 6240 (nodes without accelerators),
Intel Skylake Gold 6126 (nodes with accelerators) and Intel Skylake Platinum 8153.
### Intel [Cascade Lake 6240][d]
Cascade Lake core is largely identical to that of [Skylake's][a]. For in-depth detail of the Skylake core/pipeline see [Skylake (client) § Pipeline][b].
The Cascade Lake core is largely identical to that of [Skylake][a].
For in-depth detail of the Skylake core/pipeline, see [Skylake (client) § Pipeline][b].
Xeon Gold 6240 is a 64-bit 18-core x86 multi-socket high-performance server microprocessor introduced by Intel in late 2018. This chip supports up to 4-way multiprocessing. The Gold 6240, which is based on the Cascade Lake microarchitecture and is manufactured on a 14 nm process, sports 2 AVX-512 FMA units as well as three Ultra Path Interconnect links. This microprocessor, which operates at 2.6 GHz with a TDP of 150 W and a turbo boost frequency of up to 3.9 GHz, supports up to 1 TB of hexa-channel DDR4-2933 ECC memory.
......@@ -116,23 +119,23 @@ Barbora is equipped with an [NVIDIA Tesla V100-SXM2][g] accelerator.
![](img/gpu-v100.png)
|NVIDIA Tesla V100-SXM2||
| --- | --- |
| GPU Architecture | NVIDIA Volta |
| NVIDIA Tensor| Cores: 640 |
| NVIDIA CUDA® Cores | 5 120 |
| Double-Precision Performance | 7.8 TFLOP/s |
| Single-Precision Performance | 15.7 TFLOP/s |
| Tensor Performance | 125 TFLOP/s |
| GPU Memory | 16 GB HBM2 |
| Memory Bandwidth | 900 GB/sec |
| ECC | Yes |
| Interconnect Bandwidth | 300 GB/sec |
| System Interface | NVIDIA NVLink |
| Form Factor | SXM2 |
| Max Power Consumption | 300 W |
| Thermal Solution | Passive |
| Compute APIs | CUDA, DirectCompute,OpenCLTM, OpenACC |
| NVIDIA Tesla V100-SXM2 | |
| ---------------------------- | -------------------------------------- |
| GPU Architecture | NVIDIA Volta |
| NVIDIA Tensor Cores | 640 |
| NVIDIA CUDA® Cores | 5120 |
| Double-Precision Performance | 7.8 TFLOP/s |
| Single-Precision Performance | 15.7 TFLOP/s |
| Tensor Performance | 125 TFLOP/s |
| GPU Memory | 16 GB HBM2 |
| Memory Bandwidth | 900 GB/sec |
| ECC | Yes |
| Interconnect Bandwidth | 300 GB/sec |
| System Interface | NVIDIA NVLink |
| Form Factor | SXM2 |
| Max Power Consumption | 300 W |
| Thermal Solution | Passive |
| Compute APIs                 | CUDA, DirectCompute, OpenCL™, OpenACC  |
[a]: https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)#Core
[b]: https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Pipeline
......
# Hardware Overview
The Barbora cluster consists of 201 computational nodes named **cn[1-201]** of which 192 are regular compute nodes, 8 are GPU Tesla V100 accelerated nodes and 1 is a fat node. Each node is a powerful x86-64 computer, equipped with 36/24/128 cores (18-core Intel Cascade Lake 6240 / 12-core Intel Skylake Gold 6126 / 16-core Intel Skylake 8153), at least 192 GB of RAM. User access to the Barbora cluster is provided by two login nodes **login[1,2]**. The nodes are interlinked through high speed InfiniBand and Ethernet networks.
The Barbora cluster consists of 201 computational nodes named **cn[001-201]**
of which 192 are regular compute nodes, 8 are GPU Tesla V100 accelerated nodes and 1 is a fat node.
Each node is a powerful x86-64 computer, equipped with 36/24/128 cores
(18-core Intel Cascade Lake 6240 / 12-core Intel Skylake Gold 6126 / 16-core Intel Skylake 8153), at least 192 GB of RAM.
User access to the Barbora cluster is provided by two login nodes **login[1,2]**.
The nodes are interlinked through high speed InfiniBand and Ethernet networks.
The Fat node is equipped with a large amount (6144 GB) of memory. Virtualization infrastructure provides resources to run long-term servers and services in virtual mode. The Accelerated nodes, Fat node, and Virtualization infrastructure are available [upon request][a] from a PI.
The fat node is equipped with 6144 GB of memory.
Virtualization infrastructure provides resources for running long-term servers and services in virtual mode.
The Accelerated nodes, fat node, and virtualization infrastructure are available [upon request][a] from a PI.
**There are three types of compute nodes:**
......@@ -10,14 +17,17 @@ The Fat node is equipped with a large amount (6144 GB) of memory. Virtualization
* 8 compute nodes with a GPU accelerator - 4x NVIDIA Tesla V100-SXM2
* 1 fat node - equipped with 6144 GB of RAM
[More about Compute nodes][1].
[More about compute nodes][1].
GPU and accelerated nodes are available upon request, see the [Resources Allocation Policy][2].
All of these nodes are interconnected through fast InfiniBand and Ethernet networks. [More about the Network][3].
Every chassis provides an InfiniBand switch, marked **isw**, connecting all nodes in the chassis, as well as connecting the chassis to the upper level switches.
All of these nodes are interconnected through fast InfiniBand and Ethernet networks.
[More about the computing network][3].
Every chassis provides an InfiniBand switch, marked **isw**, connecting all nodes in the chassis,
as well as connecting the chassis to the upper level switches.
User access to Barbora is provided by two login nodes: login1 and login2. [More about accessing the cluster][5].
User access to Barbora is provided by two login nodes: login1 and login2.
[More about accessing the cluster][5].
The parameters are summarized in the following tables:
......@@ -32,20 +42,20 @@ The parameters are summarized in the following tables:
| RAM | min. 192 GB |
| Local disk drive | no |
| Compute network | InfiniBand HDR |
| w/o accelerator | 192, cn[1-192] |
| GPU accelerated | 8, cn[194-200] |
| w/o accelerator | 192, cn[001-192] |
| GPU accelerated | 8, cn[193-200] |
| Fat compute nodes | 1, cn[201] |
| **In total** | |
| **In total** | |
| Total theoretical peak performance (Rpeak) | 848.8448 TFLOP/s |
| Total amount of RAM | 44.544 TB |
| Node | Processor | Memory | Accelerator |
| ---------------- | --------------------------------------- | ------ | ---------------------- |
| w/o accelerator | 2 x Intel Cascade Lake 6240, 2.6 GHz | 192 GB | - |
| GPU accelerated | 2 x Intel Skylake Gold 6126, 2.6 GHz | 192 GB | NVIDIA Tesla V100-SXM2 |
| Fat compute node | 2 x Intel Skylake Platinum 8153, 2.0 GHz | 6144 GB | - |
| Node | Processor | Memory | Accelerator |
| ---------------- | --------------------------------------- | ------ | ---------------------- |
| Regular node     | 2x Intel Cascade Lake 6240, 2.6 GHz     | 192 GB  | -                      |
| GPU accelerated  | 2x Intel Skylake Gold 6126, 2.6 GHz     | 192 GB  | NVIDIA Tesla V100-SXM2 |
| Fat compute node | 2x Intel Skylake Platinum 8153, 2.0 GHz | 6144 GB | -                      |
For more details refer to [Compute nodes][1], [Storage][4], [Visualization servers][6], and [Network][3].
For more details refer to the sections [Compute Nodes][1], [Storage][4], [Visualization Servers][6], and [Network][3].
[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
......
......@@ -2,13 +2,13 @@
Welcome to Barbora supercomputer cluster. The Barbora cluster consists of 201 compute nodes, totaling 7232 compute cores with 44544 GB RAM, giving over 848 TFLOP/s theoretical peak performance.
Nodes are interconnected through a fully non-blocking fat-tree InfiniBand network, and are equipped with Intel Cascade Lake processors. A few nodes are also equipped with NVIDIA Tesla V100-SXM2 Read more in [Hardware Overview][1].
Nodes are interconnected through a fully non-blocking fat-tree InfiniBand network, and are equipped with Intel Cascade Lake processors. A few nodes are also equipped with NVIDIA Tesla V100-SXM2. Read more in [Hardware Overview][1].
The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains. These packages are accessible via the [modules environment][2].
The user data shared file-system and job data shared file-system are available to users.
The user data shared file system and job data shared file system are available to users.
The [PBS Professional Open Source Project][b] workload manager provides [computing resources allocations and job execution][3].
The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].
Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].
......@@ -22,4 +22,4 @@ Read more on how to [apply for resources][4], [obtain login credentials][5] and
[6]: ../general/shell-and-data-access.md
[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://www.pbspro.org/
[b]: https://slurm.schedmd.com/
......@@ -2,9 +2,11 @@
All of the compute and login nodes of Barbora are interconnected through an [InfiniBand][a] HDR 200 Gbps network and a Gigabit Ethernet network.
Compute nodes and the service infrastructure is connected by the HDR100 technology that allows one 200Gbps HDR port (aggregation 4x 50Gbps) to be divided into two HDR100 ports with 100Gbps (2x 50Gbps) bandwidth.
Compute nodes and the service infrastructure are connected by the HDR100 technology
that allows one 200 Gbps HDR port (aggregation 4x 50 Gbps) to be divided into two HDR100 ports with 100 Gbps (2x 50 Gbps) bandwidth.
The cabling between the L1 and L2 layer is realized by HDR cabling, connecting the end devices is realized by so called Y or splitter cable (1x HRD200 - 2x HDR100).
The cabling between the L1 and L2 layers is realized by HDR cabling;
the end devices are connected by a so-called Y or splitter cable (1x HDR200 - 2x HDR100).
![](img/hdr.jpg)
......@@ -21,9 +23,9 @@ The cabling between the L1 and L2 layer is realized by HDR cabling, connecting t
**Performance**
* 40x HDR 200Gb/s ports in a 1U switch
* 80x HDR100 100Gb/s ports in a 1U switch
* 16Tb/s aggregate switch throughput
* 40x HDR 200 Gb/s ports in a 1U switch
* 80x HDR100 100 Gb/s ports in a 1U switch
* 16 Tb/s aggregate switch throughput
* Up to 15.8 billion messages-per-second
* 90 ns switch latency
......
......@@ -120,7 +120,7 @@ The filesystem is backed up, so that it can be restored in case of a catastrophi
The SCRATCH is realized as Lustre parallel file system and is available from all login and computational nodes. There are 5 OSTs dedicated for the SCRATCH file system.
The SCRATCH filesystem is mounted in directory /scratch. Users may freely create subdirectories and files on the filesystem. Accessible capacity is 310TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and deny service to other users. Should 10TB prove insufficient, contact [support][d], the quota may be lifted upon request.
The SCRATCH filesystem is mounted in the `/scratch/project/PROJECT_ID` directory created automatically with the `PROJECT_ID` project. Accessible capacity is 310 TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10 TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and denying service to other users. Should 10 TB prove insufficient, contact [support][d]; the quota may be lifted upon request.
!!! note
The Scratch filesystem is intended for temporary scratch data generated during the calculation as well as for high-performance access to input and output files. All I/O intensive jobs must use the SCRATCH filesystem as their working directory.
......
......@@ -4,8 +4,8 @@ Remote visualization with [VirtualGL][3] is available on two nodes.
* 2 nodes
* 32 cores in total
* 2x Intel Skylake Gold 6130 – 16core@2,1GHz processors per node
* 192 GB DDR4 2667MT/s of physical memory per node (12x 16 GB)
* 2x Intel Skylake Gold 6130 – 16-core@2.1 GHz processors per node
* 192 GB DDR4 2667 MT/s of physical memory per node (12x 16 GB)
* BullSequana X450-E5 blade servers
* 2150.4 GFLOP/s per compute node
* 1x 1 GB Ethernet and 2x 10 GB Ethernet
......
# e-INFRA CZ Cloud Ostrava
Ostrava cloud consists of 22 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.
## Access
To access the cloud, you must:
* have an [e-Infra CZ account][3],
* be a member of an [active project][b].
The dashboard is available at [https://ostrava.openstack.cloud.e-infra.cz/][6].
You can specify resources/quotas for your project.
For more information, see the [Quota Limits][5] section.
## Creating First Instance
To create your first VM instance, follow the [e-INFRA CZ guide][4].
Note that the guide is similar for clouds in Brno and Ostrava,
so make sure that you follow steps for Ostrava cloud where applicable.
### Process Automation
You can automate the process using Terraform or OpenStack.
#### Terraform
Prerequisites:
* Linux/Mac/WSL terminal BASH shell
* installed Terraform and sshuttle
* downloaded [application credentials][9] from OpenStack Horizon dashboard and saved as a `project_openrc.sh.inc` text file
Follow the guide: [https://code.it4i.cz/terraform][8]
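A rough sketch of the typical workflow, assuming the Terraform configuration from the guide above has been cloned into the current directory and the application credentials saved as `project_openrc.sh.inc`:
```console
$ source project_openrc.sh.inc   # load the OpenStack application credentials
$ terraform init                 # download the required providers
$ terraform plan                 # review the resources to be created
$ terraform apply                # create the infrastructure
```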
#### OpenStack
Prerequisites:
* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]
Follow the guide: [https://code.it4i.cz/commandline][10]
Run commands:
```console
source project_openrc.sh.inc
```
```console
./cmdline-demo.sh basic-infrastructure-1
```
## Technical Reference
For the list of deployed OpenStack services, see the [list of components][1].
More information can be found on the [e-INFRA CZ website][2].
[1]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/openstack-components
[2]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site
[3]: https://docs.account.e-infra.cz/en/docs/access/account#how-to-apply-for-the-first-time
[4]: https://docs.platforms.cloud.e-infra.cz/en/docs/getting-started/creating-first-infrastructure
[5]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/quota-limits
[6]: https://ostrava.openstack.cloud.e-infra.cz/
[7]: https://cyso.cloud/docs/cloud/extra/how-to-use-the-openstack-cli-tools-on-linux/
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.platforms.cloud.e-infra.cz/en/docs/how-to-guides/obtaining-api-key
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline
[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
# IT4I Cloud
IT4I cloud consists of 14 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.
!!! Note
The guide describes steps for personal projects.<br>
Some steps may differ for large projects.<br>
For large projects, apply for resources to the [Allocation Committee][11].
## Access
To access the cloud you must be a member of an active EUROHPC project,
or fall into the **Access Category B**, i.e. [Access For Thematic HPC Resource Utilisation][11].
A personal OpenStack project is required. Request one by contacting [IT4I Support][12].
The dashboard is available at [https://cloud.it4i.cz][6].
You can see quotas set for the IT4I Cloud in the [Quota Limits][f] section.
## Creating First Instance
To create your first VM instance, follow the steps below:
### Log In
Go to [https://cloud.it4i.cz][6], enter your LDAP username and password and choose the `IT4I_LDAP` domain. After you sign in, you will be redirected to the dashboard.
![](../img/login.png)
### Create Key Pair
SSH key is required for remote access to your instance.
1. Go to **Project > Compute > Key Pairs** and click the **Create Key Pair** button.
![](../img/keypairs.png)
1. In the Create Key Pair window, name your key pair, select `SSH Key` for key type and confirm by clicking Create Key Pair.
![](../img/keypairs1.png)
1. Download and manage the private key according to your operating system.
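For example, on Linux you would typically restrict the permissions of the downloaded private key before using it (the file name below is only a placeholder for the key pair you created):
```console
$ chmod 600 ~/Downloads/my-keypair.pem   # placeholder file name; SSH refuses keys readable by others
```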
### Update Security Group
To be able to remotely access your VM instance, you have to allow access in the security group.
1. Go to **Project > Network > Security Groups** and click on **Manage Rules** for the default security group.
![](../img/securityg.png)
1. Click on **Add Rule**, choose **SSH**, and leave the remaining fields unchanged.
![](../img/securityg1.png)
### Create VM Instance
1. In **Compute > Instances**, click **Launch Instance**.
![](../img/instance.png)
1. Choose Instance Name, Description, and number of instances. Click **Next**.
![](../img/instance1.png)
1. Choose an image from which to boot the instance. Choose to delete the volume after the instance is deleted. Click **Next**.
![](../img/instance2.png)
1. Choose the hardware resources of the instance by selecting a flavor. Additional volumes for data can be attached later on. Click **Next**.
![](../img/instance3.png)
1. Select the network and continue to **Security Groups**.
![](../img/instance4.png)
1. Allocate the security group with SSH rule that you added in the [Update Security Group](it4i-cloud.md#update-security-group) step. Then click **Next** to go to the **Key Pair**.
![](../img/securityg2.png)
1. Select the key that you created in the [Create Key Pair][g] section and launch the instance.
![](../img/instance5.png)
### Associate Floating IP
1. Click on the **Associate** button next to the floating IP.
![](../img/floatingip.png)
1. Select Port to be associated with the instance, then click the **Associate** button.
Now you can join the VM using your preferred SSH client.
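For example, from a Linux terminal (the key file, login user, and address below are placeholders; the default user depends on the image you selected):
```console
$ ssh -i ~/Downloads/my-keypair.pem ubuntu@FLOATING_IP
```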
## Process Automation
You can automate the process using OpenStack.
### OpenStack
Prerequisites:
* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]
Follow the guide: [https://code.it4i.cz/commandline][10]
Run commands:
```console
source project_openrc.sh.inc
```
```console
./cmdline-demo.sh basic-infrastructure-1
```
[1]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/openstack-components/
[2]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/
[3]: https://docs.e-infra.cz/account/
[4]: https://docs.e-infra.cz/compute/openstack/getting-started/creating-first-infrastructure/
[5]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-g2-site/quota-limits/
[6]: https://cloud.it4i.cz
[7]: https://docs.fuga.cloud/how-to-use-the-openstack-cli-tools-on-linux
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.e-infra.cz/compute/openstack/how-to-guides/obtaining-api-key/
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline
[11]: https://www.it4i.cz/en/for-users/computing-resources-allocation
[12]: mailto:support@it4i.cz
[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
[c]: einfracz-cloud.md
[d]: ../general/accessing-the-clusters/vpn-access.md
[e]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[f]: it4i-quotas.md
[g]: it4i-cloud.md#create-key-pair
# IT4I Cloud Quotas
| Resource | Quota |
|---------------------------------------|-------|
| Instances | 10 |
| VCPUs | 20 |
| RAM | 32GB |
| Volumes | 20 |
| Volume Snapshots | 12 |
| Volume Storage | 500 |
| Floating-IPs | 1 |
| Security Groups | 10 |
| Security Group Rules | 100 |
| Networks | 1 |
| Ports | 10 |
| Routers | 1 |
| Backups | 12 |
| Groups | 10 |
| rbac_policies | 10 |
| Subnets | 1 |
| Subnet_pools | -1 |
| Fixed-ips | -1 |
| Injected-file-size | 10240 |
| Injected-path-size | 255 |
| Injected-files | 5 |
| Key-pairs | 100 |
| Properties | 128 |
| Server-groups | 10 |
| Server-group-members | 10 |
| Backup-gigabytes | 1002 |
| Per-volume-gigabytes | -1 |
# Accessing Complementary Systems
Complementary systems can be accessed at `login.cs.it4i.cz`
by any user with an active account assigned to an active project.
**SSH is required** to access Complementary systems.
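A minimal login example, with a placeholder username (use your IT4I login and credentials):
```console
$ ssh your_username@login.cs.it4i.cz
```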
## Data Storage
### Home
The `/home` file system is shared across all Complementary systems. Note that this file system is **not** shared with the file system on IT4I clusters.
### Scratch
Local `/lscratch` storage is available on individual nodes.
### PROJECT
Complementary systems are connected to the [PROJECT storage][1].
[1]: ../storage/project-storage.md
# Using AMD Partition
For testing your application on the AMD partition,
you need to prepare a job script for that partition or use the interactive job:
```console
salloc -N 1 -c 64 -A PROJECT-ID -p p03-amd --gres=gpu:4 --time=08:00:00
```
where:
- `-N 1` means allocating one server,
- `-c 64` means allocating 64 cores,
- `-A` is your project,
- `-p p03-amd` is AMD partition,
- `--gres=gpu:4` means allocating all 4 GPUs of the node,
- `--time=08:00:00` means allocation for 8 hours.
You also have the option to allocate only a subset of the resources,
by reducing the `-c` and `--gres=gpu` values.
```console
salloc -N 1 -c 48 -A PROJECT-ID -p p03-amd --gres=gpu:3 --time=08:00:00
salloc -N 1 -c 32 -A PROJECT-ID -p p03-amd --gres=gpu:2 --time=08:00:00
salloc -N 1 -c 16 -A PROJECT-ID -p p03-amd --gres=gpu:1 --time=08:00:00
```
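If you prefer a batch job over an interactive allocation, a minimal job script sketch for the same partition could look like the following (job name and application are placeholders); submit it with `sbatch`:
```console
#!/usr/bin/env bash
#SBATCH --job-name=amd-test        # placeholder job name
#SBATCH --account=PROJECT-ID       # your project
#SBATCH --partition=p03-amd        # AMD partition
#SBATCH --nodes=1                  # one server
#SBATCH --cpus-per-task=64         # 64 cores
#SBATCH --gres=gpu:4               # all 4 GPUs of the node
#SBATCH --time=08:00:00            # 8 hours
./your_application                 # placeholder for your application
```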
!!! Note
p03-amd01 server has hyperthreading **enabled**, therefore `htop` shows 128 cores.<br>
p03-amd02 server has hyperthreading **disabled**, therefore `htop` shows 64 cores.
## Using AMD MI100 GPUs
The AMD GPUs can be programmed using the [ROCm open-source platform](https://docs.amd.com/).
ROCm and related libraries are installed directly in the system.
You can find them here:
```console
/opt/rocm/
```
The actual version can be found here:
```console
[user@p03-amd02.cs]$ cat /opt/rocm/.info/version
5.5.1-74
```
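To check that the node's MI100 GPUs are visible, you can use the `rocm-smi` utility shipped with ROCm (output omitted here):
```console
[user@p03-amd02.cs ~]$ /opt/rocm/bin/rocm-smi
```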
## Basic HIP Code
The first way to program AMD GPUs is to use HIP.
The basic vector addition code in HIP looks like this.
This is a complete code example that you can copy and paste into a file.
For this example we use `vector_add.hip.cpp`.
```cpp
#include <cstdio>
#include <hip/hip_runtime.h>
__global__ void add_vectors(float * x, float * y, float alpha, int count)
{
long long idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < count)
y[idx] += alpha * x[idx];
}
int main()
{
// number of elements in the vectors
long long count = 10;
// allocation and initialization of data on the host (CPU memory)
float * h_x = new float[count];
float * h_y = new float[count];
for(long long i = 0; i < count; i++)
{
h_x[i] = i;
h_y[i] = 10 * i;
}
// print the input data
printf("X:");
for(long long i = 0; i < count; i++)
printf(" %7.2f", h_x[i]);
printf("\n");
printf("Y:");
for(long long i = 0; i < count; i++)
printf(" %7.2f", h_y[i]);
printf("\n");
// allocation of memory on the GPU device
float * d_x;
float * d_y;
hipMalloc(&d_x, count * sizeof(float));
hipMalloc(&d_y, count * sizeof(float));
// copy the data from host memory to the device
hipMemcpy(d_x, h_x, count * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_y, h_y, count * sizeof(float), hipMemcpyHostToDevice);
int tpb = 256;
int bpg = (count - 1) / tpb + 1;
// launch the kernel on the GPU
add_vectors<<< bpg, tpb >>>(d_x, d_y, 100, count);
// hipLaunchKernelGGL(add_vectors, bpg, tpb, 0, 0, d_x, d_y, 100, count);
// copy the result back to CPU memory
hipMemcpy(h_y, d_y, count * sizeof(float), hipMemcpyDeviceToHost);
// print the results
printf("Y:");
for(long long i = 0; i < count; i++)
printf(" %7.2f", h_y[i]);
printf("\n");
// free the allocated memory
hipFree(d_x);
hipFree(d_y);
delete[] h_x;
delete[] h_y;
return 0;
}
```
To compile the code, we use the `hipcc` compiler.
For compiler information, use `hipcc --version`:
```console
[user@p03-amd02.cs ~]$ hipcc --version
HIP version: 5.5.30202-eaf00c0b
AMD clang version 16.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.5.1 23194 69ef12a7c3cc5b0ccf820bc007bd87e8b3ac3037)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/rocm-5.5.1/llvm/bin
```
The code is compiled as follows:
```console
hipcc vector_add.hip.cpp -o vector_add.x
```
The correct output of the code is:
```console
[user@p03-amd02.cs ~]$ ./vector_add.x
X: 0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
Y: 0.00 10.00 20.00 30.00 40.00 50.00 60.00 70.00 80.00 90.00
Y: 0.00 110.00 220.00 330.00 440.00 550.00 660.00 770.00 880.00 990.00
```
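If you allocated fewer than all four GPUs, or want to pin the example to a particular device, you can use the standard `HIP_VISIBLE_DEVICES` environment variable (here selecting the first visible GPU):
```console
[user@p03-amd02.cs ~]$ HIP_VISIBLE_DEVICES=0 ./vector_add.x
```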
More details on HIP programming can be found in the [HIP Programming Guide](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.5/page/Introduction_to_HIP_Programming_Guide.html).
## HIP and ROCm Libraries
The list of official AMD libraries can be found [here](https://docs.amd.com/category/libraries).
The libraries are installed in the same directory as ROCm:
```console
/opt/rocm/
```
The following libraries are installed:
```console
drwxr-xr-x 4 root root 44 Jun 7 14:09 hipblas
drwxr-xr-x 3 root root 17 Jun 7 14:09 hipblas-clients
drwxr-xr-x 3 root root 29 Jun 7 14:09 hipcub
drwxr-xr-x 4 root root 44 Jun 7 14:09 hipfft
drwxr-xr-x 3 root root 25 Jun 7 14:09 hipfort
drwxr-xr-x 4 root root 32 Jun 7 14:09 hiprand
drwxr-xr-x 4 root root 44 Jun 7 14:09 hipsolver
drwxr-xr-x 4 root root 44 Jun 7 14:09 hipsparse
```
and
```console
drwxr-xr-x 4 root root 32 Jun 7 14:09 rocalution
drwxr-xr-x 4 root root 44 Jun 7 14:09 rocblas
drwxr-xr-x 4 root root 44 Jun 7 14:09 rocfft
drwxr-xr-x 4 root root 32 Jun 7 14:09 rocprim
drwxr-xr-x 4 root root 32 Jun 7 14:09 rocrand
drwxr-xr-x 4 root root 44 Jun 7 14:09 rocsolver
drwxr-xr-x 4 root root 44 Jun 7 14:09 rocsparse
drwxr-xr-x 3 root root 29 Jun 7 14:09 rocthrust
```
## Using HipBlas Library
The basic code in HIP that uses hipBLAS looks like this.
This is a complete code example that you can copy and paste into a file.
For this example we use `hipblas.hip.cpp`.
```cpp
#include <cstdio>
#include <vector>
#include <cstdlib>
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
int main()
{
srand(9600);
int width = 10;
int height = 7;
int elem_count = width * height;
// initialization of data in CPU memory
float * h_A;
hipHostMalloc(&h_A, elem_count * sizeof(*h_A));
for(int i = 0; i < elem_count; i++)
h_A[i] = (100.0f * rand()) / (float)RAND_MAX;
printf("Matrix A:\n");
for(int r = 0; r < height; r++)
{
for(int c = 0; c < width; c++)
printf("%6.3f ", h_A[r + height * c]);
printf("\n");
}
float * h_x;
hipHostMalloc(&h_x, width * sizeof(*h_x));
for(int i = 0; i < width; i++)
h_x[i] = (100.0f * rand()) / (float)RAND_MAX;
printf("vector x:\n");
for(int i = 0; i < width; i++)
printf("%6.3f ", h_x[i]);
printf("\n");
float * h_y;
hipHostMalloc(&h_y, height * sizeof(*h_y));
for(int i = 0; i < height; i++)
h_y[i] = 100.0f + i;
printf("vector y:\n");
for(int i = 0; i < height; i++)
printf("%6.3f ", h_x[i]);
printf("\n");
// initialization of data in GPU memory
float * d_A;
size_t pitch_A;
hipMallocPitch((void**)&d_A, &pitch_A, height * sizeof(*d_A), width);
hipMemcpy2D(d_A, pitch_A, h_A, height * sizeof(*d_A), height * sizeof(*d_A), width, hipMemcpyHostToDevice);
int lda = pitch_A / sizeof(float);
float * d_x;
hipMalloc(&d_x, width * sizeof(*d_x));
hipMemcpy(d_x, h_x, width * sizeof(*d_x), hipMemcpyHostToDevice);
float * d_y;
hipMalloc(&d_y, height * sizeof(*d_y));
hipMemcpy(d_y, h_y, height * sizeof(*d_y), hipMemcpyHostToDevice);
// basic calculation of the result on the CPU
float alpha=2.0f, beta=10.0f;
for(int i = 0; i < height; i++)
h_y[i] *= beta;
for(int r = 0; r < height; r++)
for(int c = 0; c < width; c++)
h_y[r] += alpha * h_x[c] * h_A[r + height * c];
printf("result y CPU:\n");
for(int i = 0; i < height; i++)
printf("%6.3f ", h_y[i]);
printf("\n");
// calculation of the result on the GPU using the hipBLAS library
hipblasHandle_t blas_handle;
hipblasCreate(&blas_handle);
hipblasSgemv(blas_handle, HIPBLAS_OP_N, height, width, &alpha, d_A, lda, d_x, 1, &beta, d_y, 1);
hipDeviceSynchronize();
hipblasDestroy(blas_handle);
// copy the GPU result to CPU memory and print it
hipMemcpy(h_y, d_y, height * sizeof(*d_y), hipMemcpyDeviceToHost);
printf("result y BLAS:\n");
for(int i = 0; i < height; i++)
printf("%6.3f ", h_y[i]);
printf("\n");
// free all the allocated memory
hipFree(d_A);
hipFree(d_x);
hipFree(d_y);
hipHostFree(h_A);
hipHostFree(h_x);
hipHostFree(h_y);
return 0;
}
```
The code compilation can be done as follows:
```console
hipcc hipblas.hip.cpp -o hipblas.x -lhipblas
```
## Using HipSolver Library
The basic code in HIP that uses hipSOLVER looks like this.
This is a complete code example that you can copy and paste into a file.
For this example we use `hipsolver.hip.cpp`.
```cpp
#include <cstdio>
#include <vector>
#include <cstdlib>
#include <algorithm>
#include <hipsolver/hipsolver.h>
#include <hipblas/hipblas.h>
int main()
{
srand(63456);
int size = 10;
// allocation and initialization of data on host. this time we use std::vector
int h_A_ld = size;
int h_A_pitch = h_A_ld * sizeof(float);
std::vector<float> h_A(size * h_A_ld);
for(int r = 0; r < size; r++)
for(int c = 0; c < size; c++)
h_A[r * h_A_ld + c] = (10.0 * rand()) / RAND_MAX;
printf("System matrix A:\n");
for(int r = 0; r < size; r++)
{
for(int c = 0; c < size; c++)
printf("%6.3f ", h_A[r * h_A_ld + c]);
printf("\n");
}
std::vector<float> h_b(size);
for(int i = 0; i < size; i++)
h_b[i] = (10.0 * rand()) / RAND_MAX;
printf("RHS vector b:\n");
for(int i = 0; i < size; i++)
printf("%6.3f ", h_b[i]);
printf("\n");
std::vector<float> h_x(size);
// memory allocation on the device and initialization
float * d_A;
size_t d_A_pitch;
hipMallocPitch((void**)&d_A, &d_A_pitch, size, size);
int d_A_ld = d_A_pitch / sizeof(float);
float * d_b;
hipMalloc(&d_b, size * sizeof(float));
float * d_x;
hipMalloc(&d_x, size * sizeof(float));
int * d_piv;
hipMalloc(&d_piv, size * sizeof(int));
int * info;
hipMallocManaged(&info, sizeof(int));
hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
hipMemcpy(d_b, h_b.data(), size * sizeof(float), hipMemcpyHostToDevice);
// solving the system using hipSOLVER
hipsolverHandle_t solverHandle;
hipsolverCreate(&solverHandle);
int wss_trf, wss_trs; // wss = WorkSpace Size
hipsolverSgetrf_bufferSize(solverHandle, size, size, d_A, d_A_ld, &wss_trf);
hipsolverSgetrs_bufferSize(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, &wss_trs);
float * workspace;
int wss = std::max(wss_trf, wss_trs);
hipMalloc(&workspace, wss * sizeof(float));
hipsolverSgetrf(solverHandle, size, size, d_A, d_A_ld, workspace, wss, d_piv, info);
hipsolverSgetrs(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, workspace, wss, info);
hipMemcpy(d_x, d_b, size * sizeof(float), hipMemcpyDeviceToDevice);
hipMemcpy(h_x.data(), d_x, size * sizeof(float), hipMemcpyDeviceToHost);
printf("Solution vector x:\n");
for(int i = 0; i < size; i++)
printf("%6.3f ", h_x[i]);
printf("\n");
hipFree(workspace);
hipsolverDestroy(solverHandle);
// perform matrix-vector multiplication A*x using hipBLAS to check if the solution is correct
hipblasHandle_t blasHandle;
hipblasCreate(&blasHandle);
float alpha = 1;
float beta = 0;
hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
hipblasSgemv(blasHandle, HIPBLAS_OP_N, size, size, &alpha, d_A, d_A_ld, d_x, 1, &beta, d_b, 1);
hipDeviceSynchronize();
hipblasDestroy(blasHandle);
for(int i = 0; i < size; i++)
h_b[i] = 0;
hipMemcpy(h_b.data(), d_b, size * sizeof(float), hipMemcpyDeviceToHost);
printf("Check multiplication vector Ax:\n");
for(int i = 0; i < size; i++)
printf("%6.3f ", h_b[i]);
printf("\n");
// free all the allocated memory
hipFree(info);
hipFree(d_piv);
hipFree(d_x);
hipFree(d_b);
hipFree(d_A);
return 0;
}
```
The code compilation can be done as follows:
```console
hipcc hipsolver.hip.cpp -o hipsolver.x -lhipblas -lhipsolver
```
## Using OpenMP Offload to Program AMD GPUs
The ROCm™ installation includes an LLVM-based implementation that fully supports the OpenMP 4.5 standard
and a subset of the OpenMP 5.0 standard.
Fortran, C/C++ compilers, and corresponding runtime libraries are included.
The OpenMP toolchain is automatically installed as part of the standard ROCm installation
and is available under `/opt/rocm/llvm`. The sub-directories are:
- `bin` : Compilers (flang and clang) and other binaries.
- `examples` : The usage section below shows how to compile and run these programs.
- `include` : Header files.
- `lib` : Libraries including those required for target offload.
- `lib-debug` : Debug versions of the above libraries.
More information can be found in the [AMD OpenMP Support Guide](https://docs.amd.com/bundle/OpenMP-Support-Guide-v5.5/page/Introduction_to_OpenMP_Support_Guide.html).
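You can confirm that the toolchain is present by querying the bundled compiler directly:
```console
[user@p03-amd02.cs ~]$ /opt/rocm/llvm/bin/clang++ --version
```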
## Compilation of OpenMP Code
A basic example that uses OpenMP offload is shown below.
Again, the code is complete and can be copied and pasted into a file.
Here we use `vadd.cpp`.
```cpp
#include <cstdio>
#include <cstdlib>
int main(int argc, char ** argv)
{
long long count = 1 << 20;
if(argc > 1)
count = atoll(argv[1]);
long long print_count = 16;
if(argc > 2)
print_count = atoll(argv[2]);
long long * a = new long long[count];
long long * b = new long long[count];
long long * c = new long long[count];
#pragma omp parallel for
for(long long i = 0; i < count; i++)
{
a[i] = i;
b[i] = 10 * i;
}
printf("A: ");
for(long long i = 0; i < print_count; i++)
printf("%3lld ", a[i]);
printf("\n");
printf("B: ");
for(long long i = 0; i < print_count; i++)
printf("%3lld ", b[i]);
printf("\n");
#pragma omp target map(to: a[0:count],b[0:count]) map(from: c[0:count])
#pragma omp teams distribute parallel for
for(long long i = 0; i < count; i++)
{
c[i] = a[i] + b[i];
}
printf("C: ");
for(long long i = 0; i < print_count; i++)
printf("%3lld ", c[i]);
printf("\n");
delete[] a;
delete[] b;
delete[] c;
return 0;
}
```
This code can be compiled like this:
```console
/opt/rocm/llvm/bin/clang++ -O3 -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 vadd.cpp -o vadd.x
```
These options are required for target offload from an OpenMP program:
- `-target x86_64-pc-linux-gnu`
- `-fopenmp`
- `-fopenmp-targets=amdgcn-amd-amdhsa`
- `-Xopenmp-target=amdgcn-amd-amdhsa`
The `-march` flag specifies the architecture of the targeted GPU.
You need to change this when moving, for instance, to LUMI with its MI250X GPUs.
The MI100 GPUs present in CS have the code `gfx908`:
- `-march=gfx908`
Note: You also have to include the `-O0`, `-O2`, or `-O3` flag.
Without this flag, the execution of the compiled code fails.
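The compiled binary can then be run directly; as the code above shows, it optionally accepts the vector length and the number of printed elements as command-line arguments:
```console
[user@p03-amd02.cs ~]$ ./vadd.x            # default: 2^20 elements, print the first 16
[user@p03-amd02.cs ~]$ ./vadd.x 1000000 8  # 10^6 elements, print the first 8
```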