Files

.gitignore

0 → 100644
+3 −0
site/
scripts/*.csv
venv/

.gitlab-ci.yml

0 → 100644
+137 −0
stages:
  - test
  - build
  - deploy
  - after_test

variables:
    PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"

docs:
  stage: test
  image: it4innovations/docker-mdcheck:latest
  allow_failure: true
  script:
  - find content/docs -name "*.mdx" | xargs mdl -r ~MD002,~MD007,~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046

pylint:
  stage: test
  image: it4innovations/docker-pycheck:latest
  before_script:
  - source /opt/.venv3/bin/activate
  script:
  - pylint $(find . -name "*.py" -not -name "feslicescript.py")

capitalize:
  stage: test
  image: it4innovations/docker-mkdocscheck:latest
  allow_failure: true
  before_script:
  - source /opt/.venv3/bin/activate
  - python -V # debug
  - pip list | grep titlecase 
  script:
  - find content/docs/ \( -name '*.mdx' -o -name '*.yml' \) ! -path '*einfracz*' -print0 | xargs -0 -n1 scripts/titlemd.py --test

ext_links:
  stage: after_test
  image: it4innovations/docker-mdcheck:latest
  allow_failure: true
  after_script:
  # remove JSON results
  - rm *.json
  script:
  - find content/docs -name '*.mdx' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
  only:
  - master

404s:
  stage: after_test
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  - echo "192.168.101.10 docs.it4i.cz" >> /etc/hosts
  - wget -V
  - echo https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
  - wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/ || true
  script:
  - cat wget.log | awk '/^Found [0-9]+ broken link[s]?.$/,/FINISHED/ { rc=-1; print $0 }; END { exit rc }'

mkdocs:
  stage: build
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  - source /opt/.venv3/bin/activate
  - python -V # debug
  - pip install -r requirements.txt
  - pip freeze # debug
  - mkdocs -V # debug
  script:
    # add version to footer
  - bash scripts/add_version.sh
    # get modules list from clusters
  - bash scripts/get_modules.sh
    # generate site_url
  - (if [ "${CI_COMMIT_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_COMMIT_REF_NAME\//" mkdocs.yml;fi);
    # generate URL for code link
#  - sed -i "s/master/$CI_BUILD_REF_NAME/g" material/partials/toc.html
    # regenerate modules matrix
  - python scripts/modules_matrix.py > docs.it4i/modules-matrix.md
  - python scripts/modules_matrix.py --json > docs.it4i/modules-matrix.json
  - curl -f0 https://code.it4i.cz/sccs/scs-api-public/raw/master/scs_api.server_public.md -o docs.it4i/apiv1.md
    # build pages
  - mkdocs build
    # replace broken links in 404.html
  - sed -i 's,href="" title=",href="/" title=",g' site/404.html
  - cp site/404.html site/403.html
  - sed -i 's/404 - Not found/403 - Forbidden/g' site/403.html
    # compress sitemap
  - gzip < site/sitemap.xml > site/sitemap.xml.gz
  artifacts:
    paths:
    - site
    expire_in: 1 week

deploy to stage:
  environment: stage
  stage: deploy
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  # install ssh-agent
  - 'which ssh-agent || ( apt-get update -y && apt-get install openssh-client -y )'
  - 'which rsync || ( apt-get update -y && apt-get install rsync -y )'
  # run ssh-agent
  - eval $(ssh-agent -s)
  # add ssh key stored in SSH_PRIVATE_KEY variable to the agent store
  - ssh-add <(echo "$SSH_PRIVATE_KEY")
  # disable host key checking (NOTE: makes you susceptible to man-in-the-middle attacks)
  # WARNING: use only in docker container, if you use it with shell you will overwrite your user's ssh config
  - mkdir -p ~/.ssh
  - echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
  script:
  - chown nginx:nginx site -R
  - rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
  only:
  - branches@sccs/docs.it4i.cz

deploy to production:
  environment: production
  stage: deploy
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  # install ssh-agent
  - 'which ssh-agent || ( apt-get update -y && apt-get install openssh-client -y )'
  - 'which rsync || ( apt-get update -y && apt-get install rsync -y )'
  # run ssh-agent
  - eval $(ssh-agent -s)
  # add ssh key stored in SSH_PRIVATE_KEY variable to the agent store
  - ssh-add <(echo "$SSH_PRIVATE_KEY")
  # disable host key checking (NOTE: makes you susceptible to man-in-the-middle attacks)
  # WARNING: use only in docker container, if you use it with shell you will overwrite your user's ssh config
  - mkdir -p ~/.ssh
  - echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
  script:
  - chown nginx:nginx site -R
  - rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/site/
  only:
  - master@sccs/docs.it4i.cz
  when: manual

.spelling

0 → 100644
+851 −0
Quantum Scalar I6
JAN
LUMI
AI
CI/CD
AWS
CLI
FAQ
s3cmd
GUI
EESSI
hipBlas
hipSolver
LUMI
apptainer
ROCm
HIP
NVIDIA DGX-2
nvidia
smi
nvidia-smi
NICE
DGX-2
DGX
DCV
In
CAE
CUBE
GPU
GSL
LMGC90
LS-DYNA
MAPDL
GPI-2
COM
.ssh
Anselm
IT4I
IT4Innovations
PBS
vnode
vnodes
Salomon
TurboVNC
VNC
DDR3
DIMM
InfiniBand
CUDA
ORCA
COMSOL
API
GNU
CUDA
NVIDIA
LiveLink
MATLAB
Allinea
LLNL
Vampir
Doxygen
VTune
TotalView
Valgrind
ParaView
OpenFOAM
MAX_FAIRSHARE
MPI4Py
MPICH2
PETSc
Trilinos
FFTW
HDF5
BiERapp
AVX
AVX2
JRE
JDK
QEMU
VMware
VirtualBox
NUMA
SMP
BLAS
LAPACK
FFTW3
Dongarra
OpenCL
cuBLAS
CESNET
Jihlava
NVIDIA
Xeon
ANSYS
CentOS
RHEL
DDR4
DIMMs
GDDR5
EasyBuild
e.g.
MPICH
MVAPICH2
OpenBLAS
ScaLAPACK
PAPI
SGI
UV2000
VM
400GB
Mellanox
RedHat
ssh.du1.cesnet.cz
ssh.du2.cesnet.cz
ssh.du3.cesnet.cz
DECI
supercomputing
AnyConnect
X11
backfilling
backfilled
SCP
Lustre
QDR
TFLOP
ncpus
myjob
pernode
mpiprocs
ompthreads
qprace
runtime
SVS
ppn
Multiphysics
aeroacoustics
turbomachinery
CFD
LS-DYNA
APDL
MAPDL
multiphysics
AUTODYN
RSM
Molpro
initio
parallelization
NWChem
SCF
ISV
profiler
Pthreads
profilers
OTF
PAPI
PCM
uncore
pre-processing
prepend
CXX
prepended
POMP2
Memcheck
unaddressable
OTF2
GPI-2
GASPI
GPI
MKL
IPP
TBB
GSL
Omics
VNC
Scalasca
IFORT
interprocedural
IDB
cloop
qcow
qcow2
vmdk
vdi
virtio
paravirtualized
Gbit
tap0
UDP
TCP
preload
qfat
Rmpi
DCT
datasets
dataset
preconditioners
partitioners
PARDISO
PaStiX
SuiteSparse
SuperLU
ExodusII
NetCDF
ParMETIS
multigrid
HYPRE
SPAI
Epetra
EpetraExt
Tpetra
64-bit
Belos
GMRES
Amesos
IFPACK
preconditioner
Teuchos
Makefiles
SAXPY
NVCC
VCF
HGMD
HUMSAVAR
ClinVar
indels
CIBERER
exomes
tmp
SSHFS
RSYNC
unmount
Cygwin
CygwinX
RFB
TightVNC
TigerVNC
GUIs
XLaunch
UTF-8
numpad
PuTTYgen
OpenSSH
IE11
x86
r21u01n577
7120P
interprocessor
IPN
toolchains
toolchain
APIs
easyblocks
GM200
GeForce
GTX
IRUs
ASIC
backplane
ICEX
IRU
PFLOP
T950B
ifconfig
inet
addr
checkbox
appfile
programmatically
http
https
filesystem
phono3py
HDF
splitted
automize
llvm
PGI
GUPC
BUPC
IBV
Aislinn
nondeterminism
stdout
stderr
i.e.
pthreads
uninitialised
broadcasted
ITAC
hotspots
Bioinformatics
semiempirical
DFT
polyfill
ES6
HTML5Rocks
minifiers
CommonJS
PhantomJS
bundlers
Browserify
versioning
isflowing
ispaused
NPM
sublicense
Streams2
Streams3
blogpost
GPG
mississippi
Uint8Arrays
Uint8Array
endianness
styleguide
noop
MkDocs
 - docs.it4i/anselm-cluster-documentation/environment-and-modules.md
MODULEPATH
bashrc
PrgEnv-gnu
bullx
MPI
PrgEnv-intel
EasyBuild
 - docs.it4i/anselm-cluster-documentation/capacity-computing.md
capacity.zip
README
 - docs.it4i/anselm-cluster-documentation/compute-nodes.md
DIMMs
 - docs.it4i/anselm-cluster-documentation/hardware-overview.md
cn
K20
Xeon
x86-64
Virtualization
virtualization
NVIDIA
5110P
SSD
lscratch
login1
login2
dm1
Rpeak
LINPACK
Rmax
E5-2665
E5-2470
P5110
isw
 - docs.it4i/anselm-cluster-documentation/introduction.md
RedHat
 - docs.it4i/anselm-cluster-documentation/job-priority.md
walltime
qexp
_List.fairshare
_time
_FAIRSHARE
1E6
 - docs.it4i/anselm-cluster-documentation/job-submission-and-execution.md
15209.srv11
qsub
15210.srv11
pwd
cn17.bullx
cn108.bullx
cn109.bullx
cn110.bullx
pdsh
hostname
SCRDIR
mkdir
mpiexec
qprod
Jobscript
jobscript
cn108
cn109
cn110
Name0
cn17
_NODEFILE
_O
_WORKDIR
mympiprog.x
_JOBID
myprog.x
openmpi
 - docs.it4i/anselm-cluster-documentation/network.md
ib0
 - docs.it4i/anselm-cluster-documentation/prace.md
PRACE
qfree
it4ifree
it4i.portal.clients
prace
1h
 - docs.it4i/anselm-cluster-documentation/shell-and-data-access.md
VPN
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-cfx.md
ANSYS
CFX
cfx.pbs
_r
ane3fl
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-mechanical-apdl.md
mapdl.pbs
_dy
 - docs.it4i/anselm-cluster-documentation/software/ansys/ls-dyna.md
HPC
lsdyna.pbs
 - docs.it4i/anselm-cluster-documentation/software/chemistry/molpro.md
OpenMP
 - docs.it4i/anselm-cluster-documentation/software/compilers.md
Fortran
 - docs.it4i/anselm-cluster-documentation/software/debuggers/intel-performance-counter-monitor.md
E5-2600
 - docs.it4i/anselm-cluster-documentation/software/debuggers/score-p.md
Makefile
 - docs.it4i/anselm-cluster-documentation/software/gpi2.md
gcc
cn79
helloworld
_gpi.c
ibverbs
gaspi
_logger
 - docs.it4i/anselm-cluster-documentation/software/intel-suite/intel-compilers.md
Haswell
CPUs
ipo
O3
vec
xAVX
omp
simd
ivdep
pragmas
openmp
xCORE-AVX2
axCORE-AVX2
 - docs.it4i/anselm-cluster-documentation/software/kvirtualization.md
rc.local
runlevel
RDP
DHCP
DNS
SMB
VDE
smb.conf
TMPDIR
run.bat.
slirp
NATs
 - docs.it4i/anselm-cluster-documentation/software/mpi/mpi4py-mpi-for-python.md
NumPy
 - docs.it4i/anselm-cluster-documentation/software/numerical-languages/matlab_1314.md
mpiLibConf.m
matlabcode.m
output.out
matlabcodefile
sched
_feature
 - docs.it4i/anselm-cluster-documentation/software/numerical-languages/matlab.md
UV2000
maxNumCompThreads
SalomonPBSPro
 - docs.it4i/anselm-cluster-documentation/software/numerical-languages/octave.md
_THREADS
_NUM
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/trilinos.md
CMake-aware
Makefile.export
_PACKAGE
_CXX
_COMPILER
_INCLUDE
_DIRS
_LIBRARY
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-ls-dyna.md
ansysdyna.pbs
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys.md
svsfem.cz
_
 - docs.it4i/anselm-cluster-documentation/software/debuggers/valgrind.md
libmpiwrap-amd64-linux
O0
valgrind
malloc
_PRELOAD
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/magma-for-intel-xeon-phi.md
cn204
_LIBS
MAGMAROOT
_magma
_server
_anselm
_from
_mic.sh
_dgetrf
_mic
_03.pdf
 - docs.it4i/anselm-cluster-documentation/software/paraview.md
cn77
localhost
v4.0.1
 - docs.it4i/anselm-cluster-documentation/storage.md
ssh.du1.cesnet.cz
Plzen
ssh.du2.cesnet.cz
ssh.du3.cesnet.cz
tier1
_home
_cache
_tape
 - docs.it4i/salomon/environment-and-modules.md
icc
ictce
ifort
imkl
intel
gompi
goolf
BLACS
iompi
iccifort
 - docs.it4i/salomon/hardware-overview.md
HW
E5-4627v2
 - docs.it4i/salomon/job-submission-and-execution.md
15209.isrv5
r21u01n577
r21u02n578
r21u03n579
r21u04n580
qsub
15210.isrv5
pwd
r2i5n6.ib0.smc.salomon.it4i.cz
r4i6n13.ib0.smc.salomon.it4i.cz
r4i7n2.ib0.smc.salomon.it4i.cz
pdsh
r2i5n6
r4i6n13
r4i7n
r4i7n2
r4i7n0
SCRDIR
myjob
mkdir
mympiprog.x
mpiexec
myprog.x
r4i7n0.ib0.smc.salomon.it4i.cz
 - docs.it4i/salomon/7d-enhanced-hypercube.md
cns1
cns576
r1i0n0
r4i7n17
cns577
cns1008
r37u31n1008
7D
 - docs.it4i/anselm-cluster-documentation/resources-allocation-policy.md
qsub
it4ifree
it4i.portal.clients
x86
x64
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-fluent.md
anslic
_admin
 - docs.it4i/anselm-cluster-documentation/software/chemistry/nwchem.md
_DIR
 - docs.it4i/anselm-cluster-documentation/software/comsol-multiphysics.md
EDU
comsol
_matlab.pbs
_job.m
mphstart
 - docs.it4i/anselm-cluster-documentation/software/debuggers/allinea-performance-reports.md
perf-report
perf
txt
html
mympiprog
_32p
 - docs.it4i/anselm-cluster-documentation/software/debuggers/intel-vtune-amplifier.md
Hotspots
 - docs.it4i/anselm-cluster-documentation/software/debuggers/scalasca.md
scorep
 - docs.it4i/anselm-cluster-documentation/software/isv_licenses.md
edu
ansys
_features
_state.txt
f1
matlab
acfd
_ansys
_acfd
_aa
_comsol
HEATTRANSFER
_HEATTRANSFER
COMSOLBATCH
_COMSOLBATCH
STRUCTURALMECHANICS
_STRUCTURALMECHANICS
_matlab
_Toolbox
_Image
_Distrib
_Comp
_Engine
_Acquisition
pmode
matlabpool
 - docs.it4i/anselm-cluster-documentation/software/mpi/mpi.md
mpirun
BLAS1
FFT
KMP
_AFFINITY
GOMP
_CPU
bullxmpi-1
mpich2
 - docs.it4i/anselm-cluster-documentation/software/mpi/Running_OpenMPI.md
bysocket
bycore
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/fftw.md
gcc3.3.3
pthread
fftw3
lfftw3
_threads-lfftw3
_omp
icc3.3.3
FFTW2
gcc2.1.5
fftw2
lfftw
_threads
icc2.1.5
fftw-mpi3
_mpi
fftw3-mpi
fftw2-mpi
IntelMPI
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/gsl.md
dwt.c
mkl
lgsl
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/hdf5.md
icc
hdf5
_INC
_SHLIB
_CPP
_LIB
_F90
gcc49
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/petsc.md
_Dist
 - docs.it4i/anselm-cluster-documentation/software/nvidia-cuda.md
lcublas
 - docs.it4i/anselm-cluster-documentation/software/operating-system.md
6.x
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/graphical-user-interface/cygwin-and-x11-forwarding.md
startxwin
cygwin64binXWin.exe
tcp
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/graphical-user-interface/x-window-system.md
Xming
XWin.exe.
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/shell-access-and-data-transfer/pageant.md
_rsa.ppk
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/shell-access-and-data-transfer/puttygen.md
_keys
organization.example.com
_rsa
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/shell-access-and-data-transfer/vpn-connection-fail-in-win-8.1.md
vpnui.exe
 - docs.it4i/salomon/ib-single-plane-topology.md
36-port
Mcell.pdf
r21-r38
nodes.pdf
 - docs.it4i/salomon/introduction.md
E5-2680v3
 - docs.it4i/salomon/network.md
r4i1n0
r4i1n1
r4i1n2
r4i1n3
ip
 - docs.it4i/salomon/software/ansys/setting-license-preferences.md
ansys161
 - docs.it4i/salomon/software/ansys/workbench.md
mpifile.txt
solvehandlers.xml
 - docs.it4i/salomon/software/chemistry/phono3py.md
vasprun.xml
disp-XXXXX
disp
_fc3.yaml
ir
_grid
_points.yaml
gofree-cond1
 - docs.it4i/salomon/software/compilers.md
HPF
 - docs.it4i/salomon/software/comsol/licensing-and-available-versions.md
ver
 - docs.it4i/salomon/software/debuggers/aislinn.md
test.cpp
 - docs.it4i/salomon/software/debuggers/intel-vtune-amplifier.md
vtune
_update1
 - docs.it4i/salomon/software/debuggers/valgrind.md
EBROOTVALGRIND
 - docs.it4i/salomon/software/intel-suite/intel-advisor.md
O2
 - docs.it4i/salomon/software/intel-suite/intel-compilers.md
UV1
 - docs.it4i/salomon/software/numerical-languages/octave.md
octcode.m
mkoctfile
 - docs.it4i/software/orca.md
pdf
 - node_modules/es6-promise/README.md
rsvp.js
es6-promise
es6-promise-min
Node.js
testem
 - node_modules/spawn-sync/lib/json-buffer/README.md
node.js
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/doc/wg-meetings/2015-01-30.md
WG
domenic
mikeal
io.js
sam
calvin
whatwg
compat
mathias
isaac
chris
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/core-util-is/README.md
core-util-is
v0.12.
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/isarray/README.md
isarray
Gruber
julian
juliangruber.com
NONINFRINGEMENT
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/process-nextick-args/license.md
Metcalf
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/process-nextick-args/readme.md
process-nextick-args
process.nextTick
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/string_decoder/README.md
_decoder.js
Joyent
joyent
repo
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/util-deprecate/History.md
kumavis
jsdocs
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/util-deprecate/README.md
util-deprecate
Rajlich
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/README.md
v7.0.0
userland
chrisdickinson
christopher.s.dickinson
gmail.com
9554F04D7259F04124DE6B476D5A82AC7E37093B
calvinmetcalf
calvin.metcalf
F3EF5F62A87FC27A22E643F714CE4FF5015AA242
Vagg
rvagg
vagg.org
DD8F2338BAE7501E3DD5AC78C273792F7D83545D
sonewman
newmansam
outlook.com
Buus
mafintosh
mathiasbuus
Denicola
domenic.me
Matteo
Collina
mcollina
matteo.collina
3ABC01543F22DD2239285CDD818674489FBC127E
 - node_modules/spawn-sync/node_modules/concat-stream/readme.md
concat-stream
concat
cb
 - node_modules/spawn-sync/node_modules/os-shim/README.md
0.10.x
os.tmpdir
os.endianness
os.EOL
os.platform
os.arch
0.4.x
Aparicio
Adesis
Netlife
S.L
 - node_modules/spawn-sync/node_modules/try-thread-sleep/node_modules/thread-sleep/README.md
node-pre-gyp
npm
 - node_modules/spawn-sync/README.md
iojs
UCX
Dask-ssh
SCRATCH
HOME
PROJECT
e-INFRA
e-INFRA CZ
DICE
qgpu
qcpu
it4i-portal-clients
it4icheckaccess
it4idedicatedtime
it4ifree
it4ifsusage
it4iuserfsusage
it4iprojectfsusage
it4imotd
e-INFRA
it4i-portal-clients
s3cmd
s5cmd
title:
e-INFRA CZ Cloud Ostrava
e-INFRA CZ Account

README.md

0 → 100644
+8 −0
# IT4Innovations Documentation

This project contains IT4Innovations user documentation source.

## Migration

* [fumadocs](https://fumadocs.vercel.app/)
 No newline at end of file
+132 −0
# Compute Nodes

## Node Configuration

Anselm is a cluster of x86-64 Intel-based nodes built with the Bull Extreme Computing bullx technology. The cluster contains four types of compute nodes.

### Compute Nodes Without Accelerators

* 180 nodes
* 2880 cores in total
* two Intel Sandy Bridge E5-2665, 8-core, 2.4GHz processors per node
* 64 GB of physical memory per node
* one 500 GB SATA 2.5” 7.2 krpm HDD per node
* bullx B510 blade servers
* cn[1-180]

### Compute Nodes With a GPU Accelerator

* 23 nodes
* 368 cores in total
* two Intel Sandy Bridge E5-2470, 8-core, 2.3GHz processors per node
* 96 GB of physical memory per node
* one 500 GB SATA 2.5” 7.2 krpm HDD per node
* GPU accelerator 1x NVIDIA Tesla Kepler K20m per node
* bullx B515 blade servers
* cn[181-203]

### Compute Nodes With a MIC Accelerator

* 4 nodes
* 64 cores in total
* two Intel Sandy Bridge E5-2470, 8-core, 2.3GHz processors per node
* 96 GB of physical memory per node
* one 500 GB SATA 2.5” 7.2 krpm HDD per node
* MIC accelerator 1x Intel Xeon Phi 5110P per node
* bullx B515 blade servers
* cn[204-207]

### Fat Compute Nodes

* 2 nodes
* 32 cores in total
* 2 Intel Sandy Bridge E5-2665, 8-core, 2.4GHz processors per node
* 512 GB of physical memory per node
* two 300 GB SAS 3.5” 15 krpm HDDs (RAID1) per node
* two 100 GB SLC SSDs per node
* bullx R423-E3 servers
* cn[208-209]

![](../img/bullxB510.png)
**Anselm bullx B510 servers**

### Compute Node Summary

| Node type                    | Count | Range       | Memory | Cores       | Queues                                    |
| ---------------------------- | ----- | ----------- | ------ | ----------- | --------------------------------------    |
| Nodes without an accelerator | 180   | cn[1-180]   | 64GB   | 16 @ 2.4GHz | qexp, qprod, qlong, qfree, qprace, qatlas |
| Nodes with a GPU accelerator | 23    | cn[181-203] | 96GB   | 16 @ 2.3GHz | qnvidia, qexp                             |
| Nodes with a MIC accelerator | 4     | cn[204-207] | 96GB   | 16 @ 2.3GHz | qmic, qexp                                |
| Fat compute nodes            | 2     | cn[208-209] | 512GB  | 16 @ 2.4GHz | qfat, qexp                                |

## Processor Architecture

Anselm is equipped with Intel Sandy Bridge processors Intel Xeon E5-2665 (nodes without accelerators and fat nodes) and Intel Xeon E5-2470 (nodes with accelerators). The processors support Advanced Vector Extensions (AVX) 256-bit instruction set.

### Intel Sandy Bridge E5-2665 Processor

* eight-core
* speed: 2.4 GHz, up to 3.1 GHz using Turbo Boost Technology
* peak performance:  19.2 GFLOP/s per core
* caches:
  * L2: 256 KB per core
  * L3: 20 MB per processor
* memory bandwidth at the level of the processor: 51.2 GB/s

### Intel Sandy Bridge E5-2470 Processor

* eight-core
* speed: 2.3 GHz, up to 3.1 GHz using Turbo Boost Technology
* peak performance:  18.4 GFLOP/s per core
* caches:
  * L2: 256 KB per core
  * L3: 20 MB per processor
* memory bandwidth at the level of the processor: 38.4 GB/s

Nodes equipped with the Intel Xeon E5-2665 CPU have the PBS resource attribute cpu_freq = 24 set, while nodes equipped with the Intel Xeon E5-2470 CPU have cpu_freq = 23 set.

```console
$ qsub -A OPEN-0-0 -q qprod -l select=4:ncpus=16:cpu_freq=24 -I
```

In this example, we allocate 4 nodes with 16 cores at 2.4 GHz per node.

Intel Turbo Boost Technology is used by default. You can disable it for all nodes of a job by using the cpu_turbo_boost resource attribute.

```console
$ qsub -A OPEN-0-0 -q qprod -l select=4:ncpus=16 -l cpu_turbo_boost=0 -I
```

## Memory Architecture

In terms of memory configuration, the cluster contains three types of compute nodes.

### Compute Nodes Without Accelerators

* 2 sockets
* Memory Controllers are integrated into processors.
  * 8 DDR3 DIMMs per node
  * 4 DDR3 DIMMs per CPU
  * 1 DDR3 DIMM per channel
  * Data rate support: up to 1600 MT/s
* Populated memory: 8 x 8 GB DDR3 DIMM 1600 MHz

### Compute Nodes With a GPU or MIC Accelerator

* 2 sockets
* Memory Controllers are integrated into processors.
  * 6 DDR3 DIMMs per node
  * 3 DDR3 DIMMs per CPU
  * 1 DDR3 DIMM per channel
  * Data rate support: up to 1600 MT/s
* Populated memory: 6 x 16 GB DDR3 DIMM 1600 MHz

### Fat Compute Nodes

* 2 sockets
* Memory Controllers are integrated into processors.
  * 16 DDR3 DIMMs per node
  * 8 DDR3 DIMMs per CPU
  * 2 DDR3 DIMMs per channel
  * Data rate support: up to 1600 MT/s
* Populated memory: 16 x 32 GB DDR3 DIMM 1600 MHz
+68 −0
# Hardware Overview

The Anselm cluster consists of 209 computational nodes named cn[1-209] of which 180 are regular compute nodes, 23 are GPU Kepler K20 accelerated nodes, 4 are MIC Xeon Phi 5110P accelerated nodes, and 2 are fat nodes. Each node is a powerful x86-64 computer, equipped with 16 cores (two eight-core Intel Sandy Bridge processors), at least 64 GB of RAM, and a local hard drive. User access to the Anselm cluster is provided by two login nodes login[1,2]. The nodes are interlinked through high speed InfiniBand and Ethernet networks. All nodes share a 320 TB /home disk for storage of user files. The 146 TB shared /scratch storage is available for scratch data.

The Fat nodes are equipped with a large amount (512 GB) of memory. Virtualization infrastructure provides resources to run long-term servers and services in virtual mode. Fat nodes and virtual servers may access 45 TB of dedicated block storage. Accelerated nodes, fat nodes, and virtualization infrastructure are available [upon request][a] from a PI.

Schematic representation of the Anselm cluster. Each box represents a node (computer) or storage capacity:

![](../img/Anselm-Schematic-Representation.png)

The cluster compute nodes cn[1-207] are organized within 13 chassis.

There are four types of compute nodes:

* 180 compute nodes without an accelerator
* 23 compute nodes with a GPU accelerator - an NVIDIA Tesla Kepler K20m
* 4 compute nodes with a MIC accelerator - an Intel Xeon Phi 5110P
* 2 fat nodes - equipped with 512 GB of RAM and two 100 GB SSD drives

[More about Compute nodes][1].

GPU and accelerated nodes are available upon request, see the [Resources Allocation Policy][2].

All of these nodes are interconnected through fast InfiniBand and Ethernet networks.  [More about the Network][3].
Every chassis provides an InfiniBand switch, marked **isw**, connecting all nodes in the chassis, as well as connecting the chassis to the upper level switches.

All of the nodes share a 320 TB /home disk for storage of user files. The 146 TB shared /scratch storage is available for scratch data. These file systems are provided by the Lustre parallel file system. There is also local disk storage available on all compute nodes in /lscratch. [More about Storage][4].

User access to the Anselm cluster is provided by two login nodes, login1 and login2, and the data mover node dm1. [More about accessing the cluster][5].

The parameters are summarized in the following tables:

| **In general**                              |                                              |
| ------------------------------------------- | -------------------------------------------- |
| Primary purpose                             | High Performance Computing                   |
| Architecture of compute nodes               | x86-64                                       |
| Operating system                            | Linux (CentOS)                               |
| [**Compute nodes**][1]                      |                                              |
| Total                                       | 209                                          |
| Processor cores                             | 16 (2 x 8 cores)                             |
| RAM                                         | min. 64 GB, min. 4 GB per core               |
| Local disk drive                            | yes - usually 500 GB                         |
| Compute network                             | InfiniBand QDR, fully non-blocking, fat-tree |
| w/o accelerator                             | 180, cn[1-180]                               |
| GPU accelerated                             | 23, cn[181-203]                              |
| MIC accelerated                             | 4, cn[204-207]                               |
| Fat compute nodes                           | 2, cn[208-209]                               |
| **In total**                                |                                              |
| Total theoretical peak performance  (Rpeak) | 94 TFLOP/s                                   |
| Total max. LINPACK performance  (Rmax)      | 73 TFLOP/s                                   |
| Total amount of RAM                         | 15.136 TB                                    |

| Node             | Processor                               | Memory | Accelerator          |
| ---------------- | --------------------------------------- | ------ | -------------------- |
| w/o accelerator  | 2 x Intel Sandy Bridge E5-2665, 2.4 GHz | 64 GB  | -                    |
| GPU accelerated  | 2 x Intel Sandy Bridge E5-2470, 2.3 GHz | 96 GB  | NVIDIA Kepler K20m   |
| MIC accelerated  | 2 x Intel Sandy Bridge E5-2470, 2.3 GHz | 96 GB  | Intel Xeon Phi 5110P |
| Fat compute node | 2 x Intel Sandy Bridge E5-2665, 2.4 GHz | 512 GB | -                    |

For more details, refer to [Compute nodes][1], [Storage][4], and [Network][3].

[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md

[a]: https://support.it4i.cz/rt
+20 −0
# Introduction

Welcome to the Anselm supercomputer cluster. The Anselm cluster consists of 209 compute nodes, totaling 3344 compute cores with 15 TB RAM, giving over 94 TFLOP/s theoretical peak performance. Each node is a powerful x86-64 computer, equipped with 16 cores, at least 64 GB of RAM, and a 500 GB hard disk drive. Nodes are interconnected through a fully non-blocking fat-tree InfiniBand network and are equipped with Intel Sandy Bridge processors. A few nodes are also equipped with NVIDIA Kepler GPU or Intel Xeon Phi MIC accelerators. Read more in [Hardware Overview][1].

Anselm runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains. These packages are accessible via the [modules environment][2].
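
For illustration, a typical modules session might look like the following (a minimal sketch; the module name is only an example):

```console
$ module avail            # list software modules available on the cluster
$ module load intel       # load a compiler/toolchain module (the name is illustrative)
$ module list             # show the currently loaded modules
```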

The user data shared file-system (HOME, 320 TB) and job data shared file-system (SCRATCH, 146 TB) are available to users.

The PBS Professional workload manager provides [computing resources allocations and job execution][3].

Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].

[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md

[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
+38 −0
# Network

All of the compute and login nodes of Anselm are interconnected through an [InfiniBand][a] QDR network and a Gigabit [Ethernet][b] network. Both networks may be used to transfer user data.

## InfiniBand Network

All of the compute and login nodes of Anselm are interconnected through a high-bandwidth, low-latency [InfiniBand][a] QDR network (IB 4 x QDR, 40 Gbps). The network topology is a fully non-blocking fat-tree.

The compute nodes may be accessed via the InfiniBand network using the ib0 network interface, in the address range 10.2.1.1-209. MPI may be used to establish a native InfiniBand connection among the nodes.

!!! note
    The network provides **2170 MB/s** transfer rates via the TCP connection (single stream) and up to **3600 MB/s** via the native InfiniBand protocol.

The fat-tree topology ensures that peak transfer rates are achieved between any two nodes, independent of network traffic exchanged among other nodes concurrently.

## Ethernet Network

The compute nodes may be accessed via the regular Gigabit Ethernet network interface eth0, in the address range 10.1.1.1-209, or by using aliases cn1-cn209. The network provides **114 MB/s** transfer rates via the TCP connection.

## Example

In this example, we access the node cn110 through the InfiniBand network via the ib0 interface, then from cn110 to cn108 through the Ethernet network.

```console
$ qsub -q qexp -l select=4:ncpus=16 -N Name0 ./myjob
$ qstat -n -u username
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
15209.srv11     username qexp     Name0        5530   4 64    --  01:00 R 00:00
   cn17/0*16+cn108/0*16+cn109/0*16+cn110/0*16

$ ssh 10.2.1.110
$ ssh 10.1.1.108
```

[a]: http://en.wikipedia.org/wiki/InfiniBand
[b]: http://en.wikipedia.org/wiki/Ethernet

docs.it4i/apiv1.md

0 → 100644
+3 −0
# API Placeholder

This page is created automatically from the API source code.

docs.it4i/apiv2.md

0 → 100644
+203 −0
Original line number Diff line number Diff line
# SCS API v2

## Info

- **OpenAPI:** 3.1.0
- **Title:** scs-api-2
- **Version:** 0.1.0
- **Server URL:** `https://scs.it4i.cz/api/v2`

## Paths

### `/dedicated-time`

**GET**

- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries, optionally filtered by cluster name or period preset
- **OperationId:** `dedicated_time_handler`

**Parameters:**

- `cluster` (query): Filter by cluster name; Available values: karolina, barbora, dgx *(optional)*
- `period` (query): Filter by time period preset; Available values: planned, active *(optional)*

**Responses:**

- `200`: List of dedicated time entries
- `400`: Failed to deserialize query, Invalid cluster, Invalid period
  Example:

  ```json
  {
    "message": "Invalid cluster: el_gordo"
  }
  ```
- `500`: Failed to retrieve dedicated time due to a server error
  Example:
  ```json
  {
    "message": "Failed to retreive dedicated time"
  }
  ```
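
For example, a request filtered by cluster and period (a sketch using the server URL and the parameter values listed above) could look like:

```console
$ curl "https://scs.it4i.cz/api/v2/dedicated-time?cluster=karolina&period=active"
```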

### `/dedicated-time-calendar`

**GET**

- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries and generates a VCalendar response.
- **OperationId:** `dedicated_time_calendar`

**Responses:**

- `200`: Dedicated time VCalendar
  Example:

  ```
  BEGIN:VCALENDAR
  VERSION:2.0
  PRODID:-//SUTD Timetable Calendar//randName//EN
  CALSCALE:GREGORIAN
  BEGIN:VEVENT
  UID:1234@example.com
  DTSTAMP:20230101T000000Z
  DTSTART:20230101T000000Z
  DTEND:20230102T000000Z
  SUMMARY:Sample Dedicated Time - Cluster Outage
  DESCRIPTION:Sample Dedicated Time - Cluster Outage
  END:VEVENT
  END:VCALENDAR
  ```

- `500`: Failed to retrieve dedicated time calendar
  Example:

  ```json
  {
    "message": "Failed to retreive dedicated time calendar"
  }
  ```

### `/motd`

**GET**

- **Summary:** Get messages of the day
- **Description:** Retrieves messages of the day, optionally filtered by category
- **OperationId:** `motd`

**Parameters:**

- `category` (query): *(optional)*

**Responses:**

- `200`: List of motd entries
- `400`: Failed to deserialize query, Invalid motd category
- `500`: Failed to retrieve motd entries due to a server error
  Example:

  ```json
  {
    "message": "Failed to retrieve motd"
  }
  ```
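
Similarly, a request filtered by category might look like this (a sketch; the category value is taken from the Motd schema examples below):

```console
$ curl "https://scs.it4i.cz/api/v2/motd?category=public-service-announcement"
```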

## Components

### Schemas

#### DedicatedTime

```yaml
type: object
required:
  - updated_at
properties:
  cluster_type:
    type: [string, 'null']
  date_efficiency:
    type: [string, 'null']
    format: date-time
  date_expiration:
    type: [string, 'null']
    format: date-time
  updated_at:
    type: string
    format: date-time
```

#### Motd

```yaml
type: object
required:
  - id
  - author
  - category
  - created_at
  - updated_at
  - date_modification
  - title
  - message_body
  - systems
properties:
  id:
    type: integer
    format: int32
    examples: [1]
  author:
    type: string
    examples: [Admin]
  category:
    type: string
    examples: [public-service-announcement]
  created_at:
    type: string
    format: date-time
  updated_at:
    type: string
    format: date-time
  date_modification:
    type: string
    format: date-time
  date_efficiency:
    type: [string, 'null']
    format: date-time
  date_expiration:
    type: [string, 'null']
    format: date-time
  date_outage_efficiency:
    type: [string, 'null']
    format: date-time
  date_outage_expiration:
    type: [string, 'null']
    format: date-time
  title:
    type: string
    examples: [Important Update]
  message_body:
    type: string
    examples: [We are experiencing some service disruptions.]
  systems:
    type: array
    items:
      type: string
      examples: [Karolina]
```

#### MsgResponse

```yaml
type: object
description: |
  Common struct for DTO-less responses
  eg. ```200 {"message":"Operation succeeded"}```
required:
  - message
properties:
  message:
    type: string
    examples: [API response]
```
+50 −0
# Introduction

This section contains documentation of decommissioned IT4Innovations' supercomputers and services.

## Salomon

The second supercomputer, built by SGI (now Hewlett Packard Enterprise), was launched in 2015. With a performance of 2 PFlop/s, it was immediately included in the TOP500 list, which ranks the world's most powerful supercomputers. It stayed there until November 2020, falling from 40th place to 460th.

Salomon was decommissioned after six years - at the end of 2021.

### Interesting Facts

| Salomon's facts              |                    |
| ---------------------------- | ------------------ |
| In operation                 | Q2 2015 - Q4 2021  |
| Theoretical peak performance | 2 PFLOP/s          |
| Number of nodes              | 1,008              |
| HOME storage capacity        | 500 TB             |
| SCRATCH storage capacity     | 1,638 TB           |
| Projects computed            | 1,085              |
| Computing jobs run           | ca. 8,700,000      |
| Corehours used               | ca. 1,014,000,000  |

## Anselm

The first supercomputer, built by Atos, was launched in 2013. For the first 3 years, it was placed in makeshift containers on the campus of VSB – Technical University of Ostrava, and was subsequently moved to the data room of the newly constructed IT4Innovations building. Anselm's computational resources were available to Czech and foreign students and scientists in fields such as material sciences, computational chemistry, biosciences, and engineering.

At the end of January 2021, after more than seven years, its operation permanently ceased. In the future, it will be a part of the [World of Civilization exhibition][a] in Lower Vitkovice.

### Interesting Facts

| Anselm's facts               |                    |
| ---------------------------- | ------------------ |
| Cost                         | 90,000,000 CZK     |
| In operation                 | Q2 2013 - Q1 2021  |
| Theoretical peak performance | 94 TFLOP/s         |
| Number of nodes              | 209                |
| HOME storage capacity        | 320 TB             |
| SCRATCH storage capacity     | 146 TB             |
| Projects computed            | 725                |
| Computing jobs run           | 2,630,567          |
| Corehours used               | 134,130,309        |
| Power consumption            | 77 kW              |

## PRACE

The Partnership for Advanced Computing in Europe aims to facilitate access to a research infrastructure that enables high-impact scientific discovery and engineering research and development across all disciplines to enhance European competitiveness for the benefit of society. For more information, see the [official website][b].

[a]: https://www.dolnivitkovice.cz/en/science-and-technology-centre/exhibitions/
[b]: https://prace-ri.eu/
# Hardware Overview

!!!important Work in progress
    Barbora NG documentation is a WIP.
    The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.

    The launch of Barbora NG is planned for October/November.
    In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.

Barbora NG consists of 141 non-accelerated compute nodes named **cn[001-141]**.
Each node is a powerful x86-64 computer equipped with 192 cores
(2x Intel Xeon 6952P with 96 CPU cores) and 768 GB RAM.
User access to the Barbora NG cluster is provided by two login nodes **login[1-2]**.
The nodes are interlinked through high speed InfiniBand NDR and Ethernet networks.

The parameters are summarized in the following tables:

| **In general**                       |                       |
| ------------------------------------ | --------------------- |
| Architecture of compute nodes        | x86-64                |
| Operating system                     | Linux                 |
| [**Compute nodes**][1]               |                       |
| Total                                | 141                   |
| Processor Type                       | [Intel Xeon 6952P][b] |
| Architecture                         | Granite Rapids        |
| Processor cores                      | 96                    |
| Processors per node                  | 2                     |
| RAM                                  | 768 GB                |
| Local disk drive                     | no                    |
| Compute network                      | InfiniBand NDR        |
| non-accelerated                      | 141, cn[001-141]      |
| **In total**                         |                       |
| Theoretical peak performance (Rpeak) | ??? TFLOP/s           |
| Cores                                | 27072                 |
| RAM                                  | 108.288 TB            |

[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md
[6]: visualization.md

[a]: https://support.it4i.cz/rt
[b]: https://www.intel.com/content/www/us/en/products/sku/241643/intel-xeon-6952p-processor-480m-cache-2-10-ghz/specifications.html
 No newline at end of file
+36 −0
# Introduction

!!!important Work in progress
    Barbora NG documentation is a WIP.
    The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.

    The launch of Barbora NG is planned for October/November.
    In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.

Welcome to the Barbora Next Gen (NG) supercomputer cluster.
Barbora NG is our latest supercomputer, which consists of 141 compute nodes,
totaling 27072 compute cores with 108288 GB RAM, giving over ??? TFLOP/s theoretical peak performance.

Nodes are interconnected through a fully non-blocking fat-tree InfiniBand NDR network
and are equipped with Intel Granite Rapids processors.
Read more in [Hardware Overview][1].

The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains.
These packages are accessible via the [modules environment][2].

The user data shared file system and job data shared file system are available to users.

The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].

Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].


[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md

[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://slurm.schedmd.com/
+146 −0
# Compute Nodes

Barbora is a cluster of x86-64 Intel-based nodes built with the BullSequana Computing technology.
The cluster contains three types of compute nodes.

## Compute Nodes Without Accelerators

* 192 nodes
* 6912 cores in total
* 2x Intel Cascade Lake 6240, 18-core, 2.6 GHz processors per node
* 192 GB DDR4 2933 MT/s of physical memory per node (12x16 GB)
* BullSequana X1120 blade servers
* 2995.2 GFLOP/s per compute node
* 1x 1 Gb Ethernet port
* 1x HDR100 IB port
* 3 compute nodes per X1120 blade server
* cn[1-192]

![](img/BullSequanaX1120.png)

## Compute Nodes With a GPU Accelerator

* 8 nodes
* 192 cores in total
* two Intel Skylake Gold 6126, 12-core, 2.6 GHz processors per node
* 192 GB DDR4 2933MT/s with ECC of physical memory per node (12x16 GB)
* 4x GPU accelerator NVIDIA Tesla V100-SXM2 per node
* BullSequana X410-E5 NVLink-V blade servers
* 1996.8 GFLOP/s per compute node
* GPU-to-GPU All-to-All NVLINK 2.0, GPU-Direct
* 1 Gb Ethernet
* 2x HDR100 IB ports
* cn[193-200]

![](img/BullSequanaX410E5GPUNVLink.jpg)

## Fat Compute Node

* 1x BullSequana X808 server
* 128 cores in total
* 8 Intel Skylake 8153, 16-core, 2.0 GHz, 125 W
* 6144 GiB DDR4 2667 MT/s of physical memory per node (96x64 GB)
* 2x HDR100 IB port
* 8192 GFLOP/s
* cn[201]

![](img/BullSequanaX808.jpg)

## Compute Node Summary

| Node type                    | Count | Range       | Memory   | Cores         |
| ---------------------------- | ----- | ----------- | -------- | ------------- |
| Nodes without an accelerator | 192   | cn[1-192]   | 192 GB   | 36 @ 2.6 GHz  |
| Nodes with a GPU accelerator | 8     | cn[193-200] | 192 GB   | 24 @ 2.6 GHz  |
| Fat compute nodes            | 1     | cn[201]     | 6144 GiB | 128 @ 2.0 GHz |

## Processor Architecture

Barbora is equipped with Intel Cascade Lake Xeon Gold 6240 processors (nodes without accelerators),
Intel Skylake Gold 6126 (nodes with accelerators), and Intel Skylake Platinum 8153 (the fat node).

### Intel [Cascade Lake 6240][d]

The Cascade Lake core is largely identical to that of [Skylake][a].
For in-depth detail of the Skylake core/pipeline, see [Skylake (client) § Pipeline][b].

Xeon Gold 6240 is a 64-bit 18-core x86 multi-socket high performance server microprocessor introduced by Intel in 2019. This chip supports up to 4-way multiprocessing. The Gold 6240, which is based on the Cascade Lake microarchitecture and is manufactured on a 14 nm process, sports 2 AVX-512 FMA units as well as three Ultra Path Interconnect links. This microprocessor, which operates at 2.6 GHz with a TDP of 150 W and a turbo boost frequency of up to 3.9 GHz, supports up to 1 TB of hexa-channel DDR4-2933 ECC memory.

* **Family**: Xeon Gold
* **Cores**: 18
* **Threads**: 36
* **L1I Cache**: 576 KiB, 18x32 KiB, 8-way set associative
* **L1D Cache**: 576 KiB, 18x32 KiB, 8-way set associative, write-back
* **L2 Cache**: 18 MiB, 18x1 MiB, 16-way set associative, write-back
* **L3 Cache**: 24.75 MiB, 18x1.375 MiB, 11-way set associative, write-back
* **Instructions**: x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA3, F16C, BMI, BMI2, VT-x, VT-d, TXT, TSX, RDSEED, ADCX, PREFETCHW, CLFLUSHOPT, XSAVE, SGX, MPX, AVX-512 (New instructions for [Vector Neural Network Instructions][c])
* **Frequency**: 2.6 GHz
* **Max turbo**: 3.9 GHz
* **Process**: 14 nm
* **TDP**: 150 W

### Intel [Skylake Gold 6126][e]

Xeon Gold 6126 is a 64-bit dodeca-core x86 multi-socket high performance server microprocessor introduced by Intel in mid-2017. This chip supports up to 4-way multiprocessing. The Gold 6126, which is based on the server configuration of the Skylake microarchitecture and is manufactured on a 14 nm+ process, sports 2 AVX-512 FMA units as well as three Ultra Path Interconnect links. This microprocessor, which operates at 2.6 GHz with a TDP of 125 W and a turbo boost frequency of up to 3.7 GHz, supports up to 768 GiB of hexa-channel DDR4-2666 ECC memory.

* **Family**: Xeon Gold
* **Cores**: 12
* **Threads**: 24
* **L1I Cache**: 384 KiB, 12x32 KiB, 8-way set associative
* **L1D Cache**: 384 KiB, 12x32 KiB, 8-way set associative, write-back
* **L2 Cache**: 12 MiB, 12x1 MiB, 16-way set associative, write-back
* **L3 Cache**: 19.25 MiB, 14x1.375 MiB, 11-way set associative, write-back
* **Instructions**: x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA3, F16C, BMI, BMI2, VT-x, VT-d, TXT, TSX, RDSEED, ADCX, PREFETCHW, CLFLUSHOPT, XSAVE, SGX, MPX, AVX-512
* **Frequency**: 2.6 GHz
* **Max turbo**: 3.7 GHz
* **Process**: 14 nm
* **TDP**: 125 W

### Intel [Skylake Platinum 8153][f]

Xeon Platinum 8153 is a 64-bit 16-core x86 multi-socket highest performance server microprocessor introduced by Intel in mid-2017. This chip supports up to 8-way multiprocessing. The Platinum 8153, which is based on the server configuration of the Skylake microarchitecture and is manufactured on a 14 nm+ process, sports 2 AVX-512 FMA units as well as three Ultra Path Interconnect links. This microprocessor, which operates at 2 GHz with a TDP of 125 W and a turbo boost frequency of up to 2.8 GHz, supports up to 768 GiB of hexa-channel DDR4-2666 ECC memory.

* **Family**: Xeon Platinum
* **Cores**: 16
* **Threads**: 32
* **L1I Cache**: 512 KiB, 16x32 KiB, 8-way set associative
* **L1D Cache**: 512 KiB, 16x32 KiB, 8-way set associative, write-back
* **L2 Cache**: 16 MiB, 16x1 MiB, 16-way set associative, write-back
* **L3 Cache**: 22 MiB, 16x1.375 MiB, 11-way set associative, write-back
* **Instructions**: x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA3, F16C, BMI, BMI2, VT-x, VT-d, TXT, TSX, RDSEED, ADCX, PREFETCHW, CLFLUSHOPT, XSAVE, SGX, MPX, AVX-512
* **Frequency**: 2.0 GHz
* **Max turbo**: 2.8 GHz
* **Process**: 14 nm
* **TDP**: 125 W

## GPU Accelerator

Barbora is equipped with [NVIDIA Tesla V100-SXM2][g] accelerators.

![](img/gpu-v100.png)

| NVIDIA Tesla V100-SXM2       |                                        |
| ---------------------------- | -------------------------------------- |
| GPU Architecture             | NVIDIA Volta                           |
| NVIDIA Tensor Cores          | 640                                    |
| NVIDIA CUDA® Cores           | 5120                                   |
| Double-Precision Performance | 7.8 TFLOP/s                             |
| Single-Precision Performance | 15.7 TFLOP/s                            |
| Tensor Performance           | 125 TFLOP/s                             |
| GPU Memory                   | 16 GB HBM2                              |
| Memory Bandwidth             | 900 GB/sec                              |
| ECC                          | Yes                                    |
| Interconnect Bandwidth       | 300 GB/sec                              |
| System Interface             | NVIDIA NVLink                          |
| Form Factor                  | SXM2                                   |
| Max Power Consumption        | 300 W                                   |
| Thermal Solution             | Passive                                |
| Compute APIs                 | CUDA, DirectCompute, OpenCLTM, OpenACC |

[a]: https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)#Core
[b]: https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Pipeline
[c]: https://en.wikichip.org/wiki/x86/avx512vnni
[d]: https://en.wikichip.org/wiki/intel/xeon_gold/6240
[e]: https://en.wikichip.org/wiki/intel/xeon_gold/6126
[f]: https://en.wikichip.org/wiki/intel/xeon_platinum/8153
[g]: https://images.nvidia.com/content/technologies/volta/pdf/tesla-volta-v100-datasheet-letter-fnl-web.pdf
+67 −0
# Hardware Overview

The Barbora cluster consists of 201 computational nodes named **cn[001-201]**
of which 192 are regular compute nodes, 8 are GPU Tesla V100 accelerated nodes and 1 is a fat node.
Each node is a powerful x86-64 computer, equipped with 36/24/128 cores
(18-core Intel Cascade Lake 6240 / 12-core Intel Skylake Gold 6126 / 16-core Intel Skylake 8153) and at least 192 GB of RAM.
User access to the Barbora cluster is provided by two login nodes **login[1,2]**.
The nodes are interlinked through high speed InfiniBand and Ethernet networks.

The fat node is equipped with 6144 GB of memory.
Virtualization infrastructure provides resources for running long-term servers and services in virtual mode.
The Accelerated nodes, fat node, and virtualization infrastructure are available [upon request][a] from a PI.

**There are three types of compute nodes:**

* 192 compute nodes without an accelerator
* 8 compute nodes with a GPU accelerator - 4x NVIDIA Tesla V100-SXM2
* 1 fat node - equipped with 6144 GB of RAM

[More about compute nodes][1].

GPU and accelerated nodes are available upon request, see the [Resources Allocation Policy][2].

All of these nodes are interconnected through fast InfiniBand and Ethernet networks.
[More about the computing network][3].
Every chassis provides an InfiniBand switch, marked **isw**, connecting all nodes in the chassis,
as well as connecting the chassis to the upper level switches.

User access to Barbora is provided by two login nodes: login1 and login2.
[More about accessing the cluster][5].

The parameters are summarized in the following tables:

| **In general**                              |                                              |
| ------------------------------------------- | -------------------------------------------- |
| Primary purpose                             | High Performance Computing                   |
| Architecture of compute nodes               | x86-64                                       |
| Operating system                            | Linux                                        |
| [**Compute nodes**][1]                      |                                              |
| Total                                       | 201                                          |
| Processor cores                             | 36/24/128 (2x18 cores/2x12 cores/8x16 cores) |
| RAM                                         | min. 192 GB                                  |
| Local disk drive                            | no                                           |
| Compute network                             | InfiniBand HDR                               |
| w/o accelerator                             | 192, cn[001-192]                             |
| GPU accelerated                             | 8, cn[193-200]                               |
| Fat compute nodes                           | 1, cn[201]                                   |
| **In total**                               |                                             |
| Total theoretical peak performance  (Rpeak) | 848.8448 TFLOP/s                             |
| Total amount of RAM                         | 44.544 TB                                    |

| Node             | Processor                               | Memory | Accelerator            |
| ---------------- | --------------------------------------- | ------ | ---------------------- |
| Regular node     | 2x Intel Cascade Lake 6240, 2.6 GHz     | 192GB  | -                      |
| GPU accelerated  | 2x Intel Skylake Gold 6126, 2.6 GHz     | 192GB  | NVIDIA Tesla V100-SXM2 |
| Fat compute node | 2x Intel Skylake Platinum 8153, 2.0 GHz | 6144GB | -                      |

For more details refer to the sections [Compute Nodes][1], [Storage][4], [Visualization Servers][6], and [Network][3].

[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md
[6]: visualization.md

[a]: https://support.it4i.cz/rt
+25 −0
# Introduction

Welcome to the Barbora supercomputer cluster. The Barbora cluster consists of 201 compute nodes, totaling 7232 compute cores with 44544 GB RAM, giving over 848 TFLOP/s theoretical peak performance.

Nodes are interconnected through a fully non-blocking fat-tree InfiniBand network, and are equipped with Intel Cascade Lake processors. A few nodes are also equipped with NVIDIA Tesla V100-SXM2. Read more in [Hardware Overview][1].

The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains. These packages are accessible via the [modules environment][2].

The user data shared file system and job data shared file system are available to users.

The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].
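
As a minimal illustration of requesting resources through Slurm (a sketch only; the project ID, partition, and script name are placeholders):

```console
$ salloc -A PROJECT-ID -p qcpu --nodes=1 --time=01:00:00   # interactive allocation
$ sbatch -A PROJECT-ID -p qcpu --nodes=2 ./myjob.sh        # batch job submission
```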

Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].

![](img/BullSequanaX.png)

[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md

[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://slurm.schedmd.com/
+52 −0
# Network

All of the compute and login nodes of Barbora are interconnected through an [InfiniBand][a] HDR 200 Gbps network and a Gigabit Ethernet network.

Compute nodes and the service infrastructure are connected by the HDR100 technology,
which allows one 200 Gbps HDR port (an aggregation of 4x 50 Gbps) to be divided into two HDR100 ports, each with 100 Gbps (2x 50 Gbps) bandwidth.

The cabling between the L1 and L2 layers is realized by HDR cabling,
while the end devices are connected by a so-called Y (splitter) cable (1x HDR200 - 2x HDR100).

![](img/hdr.jpg)

**The computing network thus implemented fulfills the following parameters**

* 100 Gbps
* Latencies less than 10 microseconds (0.6 μs end-to-end, <90ns switch hop)
* Adaptive routing support
* MPI communication support
* IP protocol support (IPoIB)
* Support for SCRATCH Data Storage and NVMe over Fabric Data Storage.

## Mellanox QM8700 40-Ports Switch

**Performance**

* 40x HDR 200 Gb/s ports in a 1U switch
* 80x HDR100 100 Gb/s ports in a 1U switch
* 16 Tb/s aggregate switch throughput
* Up to 15.8 billion messages-per-second
* 90ns switch latency

**Optimized Design**

* 1+1 redundant & hot-swappable power
* 80 PLUS Gold and Energy Star certified power supplies
* Dual-core x86 CPU

**Advanced Design**

* Adaptive routing
* Collective offloads (Mellanox SHARP technology)
* VL mapping (VL2VL)

![](img/QM8700.jpg)

## BullSequana XH2000 HDRx WH40 MODULE

* Mellanox QM8700 switch modified for direct liquid cooling (Atos Cold Plate), with form factor for installing the Bull Sequana XH2000 rack

![](img/XH2000.png)

[a]: http://en.wikipedia.org/wiki/InfiniBand
# Storage

There are three main shared file systems on the Barbora cluster: [HOME][1], [SCRATCH][2], and [PROJECT][5]. All login and compute nodes may access the same data on the shared file systems. Compute nodes are also equipped with local (non-shared) scratch, RAM disk, and tmp file systems.

## Archiving

Do not use shared filesystems as a backup for large amounts of data or as a means of long-term archiving. The academic staff and students of research institutions in the Czech Republic can use the [CESNET storage service][3], which is available via SSHFS.

## Shared Filesystems

The Barbora cluster provides three main shared filesystems: the [HOME filesystem][1], the [SCRATCH filesystem][2], and the [PROJECT filesystem][5].

All filesystems are accessible via the InfiniBand network.

The HOME and PROJECT filesystems are realized as NFS filesystems.

The SCRATCH filesystem is realized as a parallel Lustre filesystem.

Extended ACLs are provided on the Lustre filesystem for sharing data with other users using fine-grained control.

### Understanding the Lustre Filesystems

A user file on the [Lustre filesystem][a] can be divided into multiple chunks (stripes) and stored across a subset of the object storage targets (OSTs) (disks). The stripes are distributed among the OSTs in a round-robin fashion to ensure load balancing.

When a client (a compute node from your job) needs to create or access a file, the client queries the metadata server (MDS) and the metadata target (MDT) for the layout and location of the [file's stripes][b]. Once the file is opened and the client obtains the striping information, the MDS is no longer involved in the file I/O process. The client interacts directly with the object storage servers (OSSes) and OSTs to perform I/O operations such as locking, disk allocation, storage, and retrieval.

If multiple clients try to read and write the same part of a file at the same time, the Lustre distributed lock manager enforces coherency, so that all clients see consistent results.

There is default stripe configuration for Barbora Lustre filesystems. However, users can set the following stripe parameters for their own directories or files to get optimum I/O performance:

1. `stripe_size` the size of the chunk in bytes; specify with k, m, or g to use units of KB, MB, or GB, respectively; the size must be an even multiple of 65,536 bytes; default is 1MB for all Barbora Lustre filesystems
1. `stripe_count` the number of OSTs to stripe across; default is 1 for Barbora Lustre filesystems; one can specify -1 to use all OSTs in the filesystem.
1. `stripe_offset` the index of the OST where the first stripe is to be placed; default is -1 which results in random selection; using a non-default value is NOT recommended.

!!! note
    Setting stripe size and stripe count correctly for your needs may significantly affect the I/O performance.

Use the `lfs getstripe` command for getting the stripe parameters. Use `lfs setstripe` for setting the stripe parameters to get optimal I/O performance. The correct stripe setting depends on your needs and file access patterns.

```console
$ lfs getstripe dir|filename
$ lfs setstripe -s stripe_size -c stripe_count -o stripe_offset dir|filename
```

Example:

```console
$ lfs getstripe /scratch/projname
$ lfs setstripe -c -1 /scratch/projname
$ lfs getstripe /scratch/projname
```

In this example, we view the current stripe setting of the /scratch/projname/ directory. The stripe count is changed to all OSTs and verified. All files written to this directory will be striped over all 5 OSTs.

Use `lfs check osts` to see the number and status of active OSTs for each filesystem on Barbora. Learn more by reading the man page:

```console
$ lfs check osts
$ man lfs
```

### Hints on Lustre Stripping

!!! note
    Increase the `stripe_count` for parallel I/O to the same file.

When multiple processes are writing blocks of data to the same file in parallel, the I/O performance for large files will improve when the `stripe_count` is set to a larger value. The stripe count sets the number of OSTs to which the file will be written. By default, the stripe count is set to 1. While this default setting provides for efficient access of metadata (for example to support the `ls -l` command), large files should use stripe counts of greater than 1. This will increase the aggregate I/O bandwidth by using multiple OSTs in parallel instead of just one. A rule of thumb is to use a stripe count approximately equal to the number of gigabytes in the file.

Another good practice is to make the stripe count be an integral factor of the number of processes performing the write in parallel, so that you achieve load balance among the OSTs. For example, set the stripe count to 16 instead of 15 when you have 64 processes performing the writes.
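
For example, for a single shared output file written by 64 processes, you might set a stripe count of 16 on the target directory before the job runs. The path below is only illustrative; use your own project directory:

```console
$ lfs setstripe -c 16 /scratch/project/PROJECT_ID/output
$ lfs getstripe /scratch/project/PROJECT_ID/output
```

Files created in the directory afterwards inherit the new striping.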

!!! note
    Using a large stripe size can improve performance when accessing very large files.

Large stripe size allows each client to have exclusive access to its own part of a file. However, it can be counterproductive in some cases if it does not match your I/O pattern. The choice of stripe size has no effect on a single-stripe file.

Read more [here][c].

### Lustre on Barbora

The architecture of Lustre on Barbora is composed of two metadata servers (MDS) and two data/object storage servers (OSS).

**Configuration of the SCRATCH storage**

* 2x Metadata server
* 2x Object storage server
* Lustre object storage
  * One disk array NetApp E2800
  * 54x 8TB 10kRPM 2.5” SAS HDDs
  * 5x RAID6 (8+2) OST (Object storage target)
  * 4 hot-spare disks
* Lustre metadata storage
  * One disk array NetApp E2600
  * 12x 300GB 15kRPM SAS disks
  * 2 groups of 5 disks in RAID5 (Metadata target)
  * 2 hot-spare disks

### HOME File System

The HOME filesystem is mounted in the /home directory. Users' home directories /home/username reside on this filesystem. The accessible capacity is 28TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 25GB per user. Should 25GB prove insufficient, contact [support][d]; the quota may be lifted upon request.

!!! note
    The HOME filesystem is intended for preparation, evaluation, processing and storage of data generated by active Projects.

The HOME filesystem should not be used to archive data of past Projects or other unrelated data.

The files on HOME filesystem will not be deleted until the end of the [user's lifecycle][4].

The filesystem is backed up, so that it can be restored in case of a catastrophic failure resulting in significant data loss. However, this backup is not intended to restore old versions of user data or to restore (accidentally) deleted files.

| HOME filesystem      |                 |
| -------------------- | --------------- |
| Accesspoint          | /home/username  |
| Capacity             | 28TB           |
| Throughput           | 1GB/s          |
| User space quota     | 25GB           |
| User inodes quota    | 500K           |
| Protocol             | NFS             |

### SCRATCH File System

The SCRATCH filesystem is realized as a Lustre parallel filesystem and is available from all login and compute nodes. There are 5 OSTs dedicated to the SCRATCH filesystem.

The SCRATCH filesystem is mounted in the `/scratch/project/PROJECT_ID` directory created automatically with the `PROJECT_ID` project. The accessible capacity is 310TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and denying service to other users. Should 10TB prove insufficient, contact [support][d]; the quota may be lifted upon request.

!!! note
    The Scratch filesystem is intended for temporary scratch data generated during the calculation as well as for high-performance access to input and output files. All I/O intensive jobs must use the SCRATCH filesystem as their working directory.

    Users are advised to save the necessary data from the SCRATCH filesystem to HOME filesystem after the calculations and clean up the scratch files.

!!! warning
    Files on the SCRATCH filesystem that are **not accessed for more than 90 days** will be automatically **deleted**.
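
To review which of your data is approaching this limit, you can, for example, list files that have not been accessed for more than 60 days. This is only a sketch; adjust the path and threshold to your project:

```console
$ find /scratch/project/PROJECT_ID -type f -atime +60
```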

The default stripe size is 1MB and the default stripe count is 1.

!!! note
    Setting stripe size and stripe count correctly for your needs may significantly affect the I/O performance.

| SCRATCH filesystem   |           |
| -------------------- | --------- |
| Mountpoint           | /scratch  |
| Capacity             | 310TB     |
| Throughput           | 5GB/s     |
| Throughput [Burst]   | 38GB/s    |
| User space quota     | 10TB      |
| User inodes quota    | 10M       |
| Default stripe size  | 1MB       |
| Default stripe count | 1         |
| Number of OSTs       | 5         |

### PROJECT File System

The PROJECT data storage is a central storage for projects'/users' data on IT4Innovations that is accessible from all clusters.
For more information, see the [PROJECT storage][6] section.

### Disk Usage and Quota Commands

Disk usage and user quotas can be checked and reviewed using the `it4ifsusage` command. You can see an example output [here][9].

To have a better understanding of where the space is exactly used, you can use following command:

```console
$ du -hs dir
```

Example for your HOME directory:

```console
$ cd /home
$ du -hs * .[a-zA-Z0-9]* | grep -E "[0-9]*G|[0-9]*M" | sort -hr
258M     cuda-samples
15M      .cache
13M      .mozilla
5,5M     .eclipse
2,7M     .idb_13.0_linux_intel64_app
```

This will list all files and directories with megabytes or gigabytes of consumed space in your current directory (HOME in this example). The list is sorted in descending order from the largest to the smallest files/directories.

### Extended ACLs

Extended ACLs provide another security mechanism beside the standard POSIX ACLs, which are defined by three entries (for owner/group/others). Extended ACLs have more than the three basic entries. In addition, they also contain a mask entry and may contain any number of named user and named group entries.

ACLs on a Lustre file system work exactly like ACLs on any Linux file system. They are manipulated with the standard tools in the standard manner.

For more information, see the [Access Control List][7] section of the documentation.
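
As a quick illustration, granting another user read access to a shared directory with the standard tools might look like this; the username and path below are only placeholders:

```console
$ setfacl -m u:otheruser:rX /scratch/project/PROJECT_ID/shared
$ getfacl /scratch/project/PROJECT_ID/shared
```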

## Local Filesystems

### TMP

Each node is equipped with a local /tmp RAMDISK directory. The /tmp directory should be used to work with temporary files. Old files in the /tmp directory are automatically purged.

### SCRATCH and RAMDISK

Each node is equipped with RAMDISK storage accessible at /tmp, /lscratch, and /ramdisk. The RAMDISK capacity is 180GB; data placed on the RAMDISK occupies the node's RAM memory (192GB in total). The RAMDISK directory should only be used to work with temporary files where very high throughput or I/O performance is required. Old files in the RAMDISK directory are automatically purged at the job's end.

#### Global RAM Disk

The Global RAM disk spans the local RAM disks of all the allocated nodes within a single job.
For more information, see the [Job Features][8] section.

## Summary

| Mountpoint | Usage                 | Protocol | Net Capacity | Throughput                 | Limitations | Access                  | Services                              |
| ---------- | --------------------- | -------- | ------------ | -------------------------- | ----------- | ----------------------- | ------------------------------------- |
| /home      | home directory        | NFS      | 28TB         | 1GB/s                      | Quota 25GB  | Compute and login nodes | backed up                             |
| /scratch   | scratch temporary     | Lustre   | 310TB        | 5GB/s, 30GB/s burst buffer | Quota 10TB  | Compute and login nodes | files older than 90 days auto-removed |
| /lscratch  | local scratch ramdisk | tmpfs    | 180GB        | 130GB/s                    | none        | Node local              | auto purged after job end             |

[1]: #home-file-system
[2]: #scratch-file-system
[3]: ../storage/cesnet-storage.md
[4]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[5]: #project-file-system
[6]: ../storage/project-storage.md
[7]: ../storage/standard-file-acl.md
[8]: ../job-features.md#global-ram-disk
[9]: ../storage/project-storage.md#project-quotas

[a]: http://www.nas.nasa.gov
[b]: http://www.nas.nasa.gov/hecc/support/kb/Lustre_Basics_224.html#striping
[c]: http://doc.lustre.org/lustre_manual.xhtml#managingstripingfreespace
[d]: https://support.it4i.cz/rt
[e]: http://man7.org/linux/man-pages/man1/nfs4_setfacl.1.html
# Visualization Servers

Remote visualization with [VirtualGL][3] is available on two nodes.

* 2 nodes
* 32 cores in total
* 2x Intel Skylake Gold 6130 – 16-core@2.1 GHz processors per node
* 192 GB DDR4 2667 MT/s of physical memory per node (12x 16 GB)
* BullSequana X450-E5 blade servers
* 2150.4 GFLOP/s per compute node
* 1x 1 GB Ethernet and 2x 10 GB Ethernet
* 1x HDR100 IB port
* 2x SSD 240 GB

![](img/bullsequanaX450-E5.png)

## NVIDIA Quadro P6000

* GPU Memory: 24 GB GDDR5X
* Memory Interface: 384-bit
* Memory Bandwidth: Up to 432 GB/s
* NVIDIA CUDA® Cores: 3840
* System Interface: PCI Express 3.0 x16
* Max Power Consumption: 250 W
* Thermal Solution: Active
* Form Factor: 4.4”H x 10.5” L, Dual Slot, Full Height
* Display Connectors: 4x DP 1.4 + DVI-D DL
* Max Simultaneous Displays: 4 direct, 4 DP1.4 Multi-Stream
* Max DP 1.4 Resolution: 7680 x 4320 @ 30 Hz
* Max DVI-D DL Resolution: 2560 x 1600 @ 60 Hz
* Graphics APIs: Shader Model 5.1, OpenGL 4.5, DirectX 12.0, Vulkan 1.0,
* Compute APIs: CUDA, DirectCompute, OpenCL™
* Floating-Point Performance-Single Precision: 12.6 TFLOP/s, Peak

![](img/quadrop6000.jpg)

## Resource Allocation Policy

| queue | active project | project resources | nodes | min ncpus | priority | authorization | walltime |
|-------|----------------|-------------------|-------|-----------|----------|---------------|----------|
| qviz Visualization queue | yes | none required | 2 | 4 | 150 | no | 1h/8h |
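
An interactive session on a visualization node can then be requested through the scheduler. The following is only a sketch, assuming the `qviz` queue is addressed by the same name; adjust the project ID and walltime as needed:

```console
$ salloc -A PROJECT-ID -p qviz --time=01:00:00
```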

## References

* [Graphical User Interface][1]
* [VPN Access][2]

[1]: ../general/shell-and-data-access.md#graphical-user-interface
[2]: ../general/shell-and-data-access.md#vpn-access
[3]: ../software/viz/vgl.md
# e-INFRA CZ Cloud Ostrava

Ostrava cloud consists of 22 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.

## Access

To access the cloud, you must:

* have an [e-Infra CZ account][3],
* be a member of an [active project][b].

The dashboard is available at [https://ostrava.openstack.cloud.e-infra.cz/][6].

You can specify resources/quotas for your project.
For more information, see the [Quota Limits][5] section.

## Creating First Instance

To create your first VM instance, follow the [e-INFRA CZ guide][4].
Note that the guide is similar for clouds in Brno and Ostrava,
so make sure that you follow steps for Ostrava cloud where applicable.

### Process Automatization

You can automate the process using Terraform or OpenStack.

#### Terraform

Prerequisites:

* Linux/Mac/WSL terminal BASH shell
* installed Terraform and sshuttle
* downloaded [application credentials][9] from OpenStack Horizon dashboard and saved as a `project_openrc.sh.inc` text file

Follow the guide: [https://code.it4i.cz/terraform][8]
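
The linked guide covers the full setup; in general, once the application credentials are sourced, the standard Terraform workflow applies. A minimal sketch, assuming the configuration files from the guide are in the current directory:

```console
$ source project_openrc.sh.inc
$ terraform init
$ terraform plan
$ terraform apply
```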

#### OpenStack

Prerequisites:

* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]

Follow the guide: [https://code.it4i.cz/commandline][10]

Run commands:

```console
source project_openrc.sh.inc
```

```console
./cmdline-demo.sh basic-infrastructure-1
```
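
After the script finishes, you can verify the created resources with the OpenStack client, for example:

```console
$ openstack server list
$ openstack network list
```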

## Technical Reference

For the list of deployed OpenStack services, see the [list of components][1].

More information can be found on the [e-INFRA CZ website][2].

[1]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/openstack-components
[2]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site
[3]: https://docs.account.e-infra.cz/en/docs/access/account#how-to-apply-for-the-first-time
[4]: https://docs.platforms.cloud.e-infra.cz/en/docs/getting-started/creating-first-infrastructure
[5]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/quota-limits
[6]: https://ostrava.openstack.cloud.e-infra.cz/
[7]: https://cyso.cloud/docs/cloud/extra/how-to-use-the-openstack-cli-tools-on-linux/
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.platforms.cloud.e-infra.cz/en/docs/how-to-guides/obtaining-api-key
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline

[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
# IT4I Cloud

IT4I cloud consists of 14 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.

!!! Note
    The guide describes steps for personal projects.<br>
    Some steps may differ for large projects.<br>
    For large projects, apply for resources to the [Allocation Committee][11].

## Access

To access the cloud you must be a member of an active EUROHPC project,
or fall into the **Access Category B**, i.e. [Access For Thematic HPC Resource Utilisation][11].

A personal OpenStack project is required. Request one by contacting [IT4I Support][12].

The dashboard is available at [https://cloud.it4i.cz][6].

You can see quotas set for the IT4I Cloud in the [Quota Limits][f] section.

## Creating First Instance

To create your first VM instance, follow the steps below:

### Log In

Go to [https://cloud.it4i.cz][6], enter your LDAP username and password and choose the `IT4I_LDAP` domain. After you sign in, you will be redirected to the dashboard.

![](../img/login.png)

### Create Key Pair

SSH key is required for remote access to your instance.

1. Go to **Project > Compute > Key Pairs** and click the **Create Key Pair** button.

    ![](../img/keypairs.png)

1. In the Create Key Pair window, name your key pair, select `SSH Key` for key type and confirm by clicking Create Key Pair.

    ![](../img/keypairs1.png)

1. Download and manage the private key according to your operating system.

### Update Security Group

To be able to remotely access your VM instance, you have to allow access in the security group.

1. Go to **Project > Network > Security Groups** and click on **Manage Rules** for the default security group.

    ![](../img/securityg.png)

1. Click on **Add Rule**, choose **SSH**, and leave the remaining fields unchanged.

    ![](../img/securityg1.png)

### Create VM Instance

1. In **Compute > Instances**, click **Launch Instance**.

    ![](../img/instance.png)

1. Choose Instance Name, Description, and number of instances. Click **Next**.

    ![](../img/instance1.png)

1. Choose an image from which to boot the instance. Choose to delete the volume after instance delete. Click **Next**.

    ![](../img/instance2.png)

1. Choose the hardware resources of the instance by selecting a flavor. Additional volumes for data can be attached later on. Click **Next**.

    ![](../img/instance3.png)

1. Select the network and continue to **Security Groups**.

    ![](../img/instance4.png)

1. Allocate the security group with SSH rule that you added in the [Update Security Group](it4i-cloud.md#update-security-group) step. Then click **Next** to go to the **Key Pair**.

    ![](../img/securityg2.png)

1. Select the key that you created in the [Create Key Pair][g] section and launch the instance.

    ![](../img/instance5.png)

### Associate Floating IP

1. Click on the **Associate** button next to the floating IP.

    ![](../img/floatingip.png)

1. Select Port to be associated with the instance, then click the **Associate** button.

Now you can join the VM using your preferred SSH client.
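
For example, from a Linux terminal (the key file name, login user, and floating IP below are placeholders; the default user depends on the image you chose):

```console
$ ssh -i ~/.ssh/my-keypair.pem ubuntu@FLOATING_IP
```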

## Process Automatization

You can automate the process using Openstack.

### OpenStack

Prerequisites:

* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]

Follow the guide: [https://code.it4i.cz/commandline][10]

Run commands:

```console
source project_openrc.sh.inc
```

```console
./cmdline-demo.sh basic-infrastructure-1
```

[1]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/openstack-components/
[2]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/
[3]: https://docs.e-infra.cz/account/
[4]: https://docs.e-infra.cz/compute/openstack/getting-started/creating-first-infrastructure/
[5]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-g2-site/quota-limits/
[6]: https://cloud.it4i.cz
[7]: https://docs.fuga.cloud/how-to-use-the-openstack-cli-tools-on-linux
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.e-infra.cz/compute/openstack/how-to-guides/obtaining-api-key/
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline
[11]: https://www.it4i.cz/en/for-users/computing-resources-allocation
[12]: mailto:support@it4i.cz

[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
[c]: einfracz-cloud.md
[d]: ../general/accessing-the-clusters/vpn-access.md
[e]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[f]: it4i-quotas.md
[g]: it4i-cloud.md#create-key-pair

# IT4I Cloud Quotas

| Resource                              | Quota |
|---------------------------------------|-------|
| Instances                             |    10 |
| VCPUs                                 |    20 |
| RAM                                   |  32GB |
| Volumes                               |    20 |
| Volume Snapshots                      |    12 |
| Volume Storage                        |   500 |
| Floating-IPs                          |     1 |
| Security Groups                       |    10 |
| Security Group Rules                  |   100 |
| Networks                              |     1 |
| Ports                                 |    10 |
| Routers                               |     1 |
| Backups                               |    12 |
| Groups                                |    10 |
| rbac_policies                         |    10 |
| Subnets                               |     1 |
| Subnet_pools                          |    -1 |
| Fixed-ips                             |    -1 |
| Injected-file-size                    | 10240 |
| Injected-path-size                    |   255 |
| Injected-files                        |     5 |
| Key-pairs                             |   100 |
| Properties                            |   128 |
| Server-groups                         |    10 |
| Server-group-members                  |    10 |
| Backup-gigabytes                      |  1002 |
| Per-volume-gigabytes                  |    -1 |

docs.it4i/config.yml

host: irods.it4i.cz
port: 1247
proxy_user: some_user
client_user: some_user
zone: IT4I

authscheme: "pam"
ssl_ca_cert_file: "~/.irods/chain_geant_ov_rsa_ca_4_full.pem"
ssl_encryption_key_size: 32
ssl_encryption_algorithm: "AES-256-CBC"
ssl_encryption_salt_size: 8
ssl_encryption_hash_rounds: 16

path_mappings:
  - irods_path: /IT4I/home/some_user
    mapping_path: /
    resource_type: dir
# Accessing Complementary Systems

Complementary systems can be accessed at `login.cs.it4i.cz`
by any user with an active account assigned to an active project.

**SSH is required** to access Complementary systems.
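
For example (replace `username` with your IT4I login name):

```console
$ ssh username@login.cs.it4i.cz
```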

## Data Storage

### Home

The `/home` file system is shared across all Complementary systems. Note that this file system is **not** shared with the file system on IT4I clusters.

### Scratch

There are local `/lscratch` storages on individual nodes.

### PROJECT

Complementary systems are connected to the [PROJECT storage][1].

[1]: ../storage/project-storage.md
# Using AMD Partition

For testing your application on the AMD partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 64 -A PROJECT-ID -p p03-amd --gres=gpu:4 --time=08:00:00
```

where:

- `-N 1` means allocating one server,
- `-c 64` means allocating 64 cores,
- `-A` is your project,
- `-p p03-amd` is AMD partition,
- `--gres=gpu:4` means allocating all 4 GPUs of the node,
- `--time=08:00:00` means allocation for 8 hours.

You also have the option to allocate only a subset of the resources
by reducing the `-c` and `--gres=gpu` values.

```console
salloc -N 1 -c 48 -A PROJECT-ID -p p03-amd --gres=gpu:3 --time=08:00:00
salloc -N 1 -c 32 -A PROJECT-ID -p p03-amd --gres=gpu:2 --time=08:00:00
salloc -N 1 -c 16 -A PROJECT-ID -p p03-amd --gres=gpu:1 --time=08:00:00
```

!!! Note
    p03-amd01 server has hyperthreading **enabled** therefore htop shows 128 cores.<br>
    p03-amd02 server has hyperthreading **disabled** therefore htop shows 64 cores.

## Using AMD MI100 GPUs

The AMD GPUs can be programmed using the [ROCm open-source platform](https://docs.amd.com/).

ROCm and related libraries are installed directly in the system.
You can find it here:

```console
/opt/rocm/
```

The actual version can be found here:

```console
[user@p03-amd02.cs]$ cat /opt/rocm/.info/version

5.5.1-74
```
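
To check that the MI100 accelerators allocated to your job are visible, you can, for example, use the `rocm-smi` utility shipped with ROCm:

```console
[user@p03-amd02.cs ~]$ rocm-smi
```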

## Basic HIP Code

The first way how to program AMD GPUs is to use HIP.

The basic vector addition code in HIP looks like this.
This is a full code example; you can copy and paste it into a file.
For this example, we use `vector_add.hip.cpp`.

```cpp
#include <cstdio>
#include <hip/hip_runtime.h>



__global__ void add_vectors(float * x, float * y, float alpha, int count)
{
    long long idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < count)
        y[idx] += alpha * x[idx];
}

int main()
{
    // number of elements in the vectors
    long long count = 10;

    // allocation and initialization of data on the host (CPU memory)
    float * h_x = new float[count];
    float * h_y = new float[count];
    for(long long i = 0; i < count; i++)
    {
        h_x[i] = i;
        h_y[i] = 10 * i;
    }

    // print the input data
    printf("X:");
    for(long long i = 0; i < count; i++)
        printf(" %7.2f", h_x[i]);
    printf("\n");
    printf("Y:");
    for(long long i = 0; i < count; i++)
        printf(" %7.2f", h_y[i]);
    printf("\n");

    // allocation of memory on the GPU device
    float * d_x;
    float * d_y;
    hipMalloc(&d_x, count * sizeof(float));
    hipMalloc(&d_y, count * sizeof(float));

    // copy the data from host memory to the device
    hipMemcpy(d_x, h_x, count * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(d_y, h_y, count * sizeof(float), hipMemcpyHostToDevice);

    int tpb = 256;
    int bpg = (count - 1) / tpb + 1;
    // launch the kernel on the GPU
    add_vectors<<< bpg, tpb >>>(d_x, d_y, 100, count);
    // hipLaunchKernelGGL(add_vectors, bpg, tpb, 0, 0, d_x, d_y, 100, count);

    // copy the result back to CPU memory
    hipMemcpy(h_y, d_y, count * sizeof(float), hipMemcpyDeviceToHost);

    // print the results
    printf("Y:");
    for(long long i = 0; i < count; i++)
        printf(" %7.2f", h_y[i]);
    printf("\n");

    // free the allocated memory
    hipFree(d_x);
    hipFree(d_y);
    delete[] h_x;
    delete[] h_y;

    return 0;
}
```

To compile the code, we use the `hipcc` compiler.
For compiler information, use `hipcc --version`:

```console
[user@p03-amd02.cs ~]$ hipcc --version

HIP version: 5.5.30202-eaf00c0b
AMD clang version 16.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.5.1 23194 69ef12a7c3cc5b0ccf820bc007bd87e8b3ac3037)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/rocm-5.5.1/llvm/bin
```

The code is compiled as follows:

```console
hipcc vector_add.hip.cpp -o vector_add.x
```

The correct output of the code is:

```console
[user@p03-amd02.cs ~]$ ./vector_add.x
X:    0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00
Y:    0.00   10.00   20.00   30.00   40.00   50.00   60.00   70.00   80.00   90.00
Y:    0.00  110.00  220.00  330.00  440.00  550.00  660.00  770.00  880.00  990.00
```

More details on HIP programming are available in the [HIP Programming Guide](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.5/page/Introduction_to_HIP_Programming_Guide.html).

## HIP and ROCm Libraries

The list of official AMD libraries can be found [here](https://docs.amd.com/category/libraries).

The libraries are installed in the same directory as ROCm:

```console
/opt/rocm/
```

The following libraries are installed:

```console
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipblas
drwxr-xr-x  3 root root   17 Jun  7 14:09 hipblas-clients
drwxr-xr-x  3 root root   29 Jun  7 14:09 hipcub
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipfft
drwxr-xr-x  3 root root   25 Jun  7 14:09 hipfort
drwxr-xr-x  4 root root   32 Jun  7 14:09 hiprand
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipsolver
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipsparse
```

and

```console
drwxr-xr-x  4 root root   32 Jun  7 14:09 rocalution
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocblas
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocfft
drwxr-xr-x  4 root root   32 Jun  7 14:09 rocprim
drwxr-xr-x  4 root root   32 Jun  7 14:09 rocrand
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocsolver
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocsparse
drwxr-xr-x  3 root root   29 Jun  7 14:09 rocthrust
```

## Using HipBlas Library

The basic code in HIP that uses hipBLAS looks like this.
This is a full code example; you can copy and paste it into a file.
For this example, we use `hipblas.hip.cpp`.

```cpp
#include <cstdio>
#include <vector>
#include <cstdlib>
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>


int main()
{
    srand(9600);

    int width = 10;
    int height = 7;
    int elem_count = width * height;


    // initialization of data in CPU memory

    float * h_A;
    hipHostMalloc(&h_A, elem_count * sizeof(*h_A));
    for(int i = 0; i < elem_count; i++)
        h_A[i] = (100.0f * rand()) / (float)RAND_MAX;
    printf("Matrix A:\n");
    for(int r = 0; r < height; r++)
    {
        for(int c = 0; c < width; c++)
            printf("%6.3f  ", h_A[r + height * c]);
        printf("\n");
    }

    float * h_x;
    hipHostMalloc(&h_x, width * sizeof(*h_x));
    for(int i = 0; i < width; i++)
        h_x[i] = (100.0f * rand()) / (float)RAND_MAX;
    printf("vector x:\n");
    for(int i = 0; i < width; i++)
        printf("%6.3f  ", h_x[i]);
    printf("\n");

    float * h_y;
    hipHostMalloc(&h_y, height * sizeof(*h_y));
    for(int i = 0; i < height; i++)
        h_y[i] = 100.0f + i;
    printf("vector y:\n");
    for(int i = 0; i < height; i++)
        printf("%6.3f  ", h_y[i]);
    printf("\n");


    // initialization of data in GPU memory

    float * d_A;
    size_t pitch_A;
    hipMallocPitch((void**)&d_A, &pitch_A, height * sizeof(*d_A), width);
    hipMemcpy2D(d_A, pitch_A, h_A, height * sizeof(*d_A), height * sizeof(*d_A), width, hipMemcpyHostToDevice);
    int lda = pitch_A / sizeof(float);

    float * d_x;
    hipMalloc(&d_x, width * sizeof(*d_x));
    hipMemcpy(d_x, h_x, width * sizeof(*d_x), hipMemcpyHostToDevice);

    float * d_y;
    hipMalloc(&d_y, height * sizeof(*d_y));
    hipMemcpy(d_y, h_y, height * sizeof(*d_y), hipMemcpyHostToDevice);


    // basic calculation of the result on the CPU

    float alpha=2.0f, beta=10.0f;

    for(int i = 0; i < height; i++)
        h_y[i] *= beta;
    for(int r = 0; r < height; r++)
        for(int c = 0; c < width; c++)
            h_y[r] += alpha * h_x[c] * h_A[r + height * c];
    printf("result y CPU:\n");
    for(int i = 0; i < height; i++)
        printf("%6.3f  ", h_y[i]);
    printf("\n");


    // calculation of the result on the GPU using the hipBLAS library

    hipblasHandle_t blas_handle;
    hipblasCreate(&blas_handle);

    hipblasSgemv(blas_handle, HIPBLAS_OP_N, height, width, &alpha, d_A, lda, d_x, 1, &beta, d_y, 1);
    hipDeviceSynchronize();

    hipblasDestroy(blas_handle);


    // copy the GPU result to CPU memory and print it
    hipMemcpy(h_y, d_y, height * sizeof(*d_y), hipMemcpyDeviceToHost);
    printf("result y BLAS:\n");
    for(int i = 0; i < height; i++)
        printf("%6.3f  ", h_y[i]);
    printf("\n");


    // free all the allocated memory
    hipFree(d_A);
    hipFree(d_x);
    hipFree(d_y);
    hipHostFree(h_A);
    hipHostFree(h_x);
    hipHostFree(h_y);

    return 0;
}
```

The code compilation can be done as follows:

```console
hipcc hipblas.hip.cpp -o hipblas.x -lhipblas
```

## Using HipSolver Library

The basic code in HIP that uses hipSOLVER looks like this.
This is a full code example; you can copy and paste it into a file.
For this example, we use `hipsolver.hip.cpp`.

```cpp
#include <cstdio>
#include <vector>
#include <cstdlib>
#include <algorithm>
#include <hipsolver/hipsolver.h>
#include <hipblas/hipblas.h>

int main()
{
    srand(63456);

    int size = 10;


    // allocation and initialization of data on host. this time we use std::vector

    int h_A_ld = size;
    int h_A_pitch = h_A_ld * sizeof(float);
    std::vector<float> h_A(size * h_A_ld);
    for(int r = 0; r < size; r++)
        for(int c = 0; c < size; c++)
            h_A[r * h_A_ld + c] = (10.0 * rand()) / RAND_MAX;
    printf("System matrix A:\n");
    for(int r = 0; r < size; r++)
    {
        for(int c = 0; c < size; c++)
            printf("%6.3f  ", h_A[r * h_A_ld + c]);
        printf("\n");
    }

    std::vector<float> h_b(size);
    for(int i = 0; i < size; i++)
        h_b[i] = (10.0 * rand()) / RAND_MAX;
    printf("RHS vector b:\n");
    for(int i = 0; i < size; i++)
        printf("%6.3f  ", h_b[i]);
    printf("\n");

    std::vector<float> h_x(size);


    // memory allocation on the device and initialization

    float * d_A;
    size_t d_A_pitch;
    hipMallocPitch((void**)&d_A, &d_A_pitch, size * sizeof(float), size); // width is given in bytes
    int d_A_ld = d_A_pitch / sizeof(float);

    float * d_b;
    hipMalloc(&d_b, size * sizeof(float));

    float * d_x;
    hipMalloc(&d_x, size * sizeof(float));

    int * d_piv;
    hipMalloc(&d_piv, size * sizeof(int));

    int * info;
    hipMallocManaged(&info, sizeof(int));

    hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
    hipMemcpy(d_b, h_b.data(), size * sizeof(float), hipMemcpyHostToDevice);


    // solving the system using hipSOLVER

    hipsolverHandle_t solverHandle;
    hipsolverCreate(&solverHandle);

    int wss_trf, wss_trs; // wss = WorkSpace Size
    hipsolverSgetrf_bufferSize(solverHandle, size, size, d_A, d_A_ld, &wss_trf);
    hipsolverSgetrs_bufferSize(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, &wss_trs);
    float * workspace;
    int wss = std::max(wss_trf, wss_trs);
    hipMalloc(&workspace, wss * sizeof(float));

    hipsolverSgetrf(solverHandle, size, size, d_A, d_A_ld, workspace, wss, d_piv, info);
    hipsolverSgetrs(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, workspace, wss, info);

    hipMemcpy(d_x, d_b, size * sizeof(float), hipMemcpyDeviceToDevice);
    hipMemcpy(h_x.data(), d_x, size * sizeof(float), hipMemcpyDeviceToHost);
    printf("Solution vector x:\n");
    for(int i = 0; i < size; i++)
        printf("%6.3f  ", h_x[i]);
    printf("\n");

    hipFree(workspace);

    hipsolverDestroy(solverHandle);


    // perform matrix-vector multiplication A*x using hipBLAS to check if the solution is correct

    hipblasHandle_t blasHandle;
    hipblasCreate(&blasHandle);

    float alpha = 1;
    float beta = 0;
    hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
    hipblasSgemv(blasHandle, HIPBLAS_OP_N, size, size, &alpha, d_A, d_A_ld, d_x, 1, &beta, d_b, 1);
    hipDeviceSynchronize();

    hipblasDestroy(blasHandle);

    for(int i = 0; i < size; i++)
        h_b[i] = 0;
    hipMemcpy(h_b.data(), d_b, size * sizeof(float), hipMemcpyDeviceToHost);
    printf("Check multiplication vector Ax:\n");
    for(int i = 0; i < size; i++)
        printf("%6.3f  ", h_b[i]);
    printf("\n");


    // free all the allocated memory

    hipFree(info);
    hipFree(d_piv);
    hipFree(d_x);
    hipFree(d_b);
    hipFree(d_A);

    return 0;
}
```

The code compilation can be done as follows:

```console
hipcc hipsolver.hip.cpp -o hipsolver.x -lhipblas -lhipsolver
```

## Using OpenMP Offload to Program AMD GPUs

The ROCm™ installation includes an LLVM-based implementation that fully supports the OpenMP 4.5 standard
and a subset of the OpenMP 5.0 standard.
Fortran, C/C++ compilers, and corresponding runtime libraries are included.

The OpenMP toolchain is automatically installed as part of the standard ROCm installation
and is available under `/opt/rocm/llvm`. The sub-directories are:

- `bin` : Compilers (flang and clang) and other binaries.
- `examples` : The usage section below shows how to compile and run these programs.
- `include` : Header files.
- `lib` : Libraries including those required for target offload.
- `lib-debug` : Debug versions of the above libraries.

More information can be found in the [AMD OpenMP Support Guide](https://docs.amd.com/bundle/OpenMP-Support-Guide-v5.5/page/Introduction_to_OpenMP_Support_Guide.html).

## Compilation of OpenMP Code

A basic example that uses OpenMP offload is shown below.
Again, the code is complete and can be copied and pasted into a file.
Here we use `vadd.cpp`.

```cpp
#include <cstdio>
#include <cstdlib>

int main(int argc, char ** argv)
{
    long long count = 1 << 20;
    if(argc > 1)
        count = atoll(argv[1]);
    long long print_count = 16;
    if(argc > 2)
        print_count = atoll(argv[2]);

    long long * a = new long long[count];
    long long * b = new long long[count];
    long long * c = new long long[count];

#pragma omp parallel for
    for(long long i = 0; i < count; i++)
    {
        a[i] = i;
        b[i] = 10 * i;
    }

    printf("A: ");
    for(long long i = 0; i < print_count; i++)
        printf("%3lld ", a[i]);
    printf("\n");

    printf("B: ");
    for(long long i = 0; i < print_count; i++)
        printf("%3lld ", b[i]);
    printf("\n");

#pragma omp target map(to: a[0:count],b[0:count]) map(from: c[0:count])
#pragma omp teams distribute parallel for
    for(long long i = 0; i < count; i++)
    {
        c[i] = a[i] + b[i];
    }

    printf("C: ");
    for(long long i = 0; i < print_count; i++)
        printf("%3lld ", c[i]);
    printf("\n");

    delete[] a;
    delete[] b;
    delete[] c;

    return 0;
}
```

This code can be compiled like this:

```console
/opt/rocm/llvm/bin/clang++ -O3 -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 vadd.cpp -o vadd.x
```

These options are required for target offload from an OpenMP program:

- `-target x86_64-pc-linux-gnu`
- `-fopenmp`
- `-fopenmp-targets=amdgcn-amd-amdhsa`
- `-Xopenmp-target=amdgcn-amd-amdhsa`

The following flag specifies the architecture of the targeted GPU.
You need to change this when moving, for instance, to LUMI with its MI250X GPUs.
The MI100 GPUs present in Complementary systems have the code `gfx908`:

- `-march=gfx908`

Note: You also have to include one of the optimization flags `-O0`, `-O2`, or `-O3`.
Without such a flag, the execution of the compiled code fails.
# Using ARM Partition

For testing your application on the ARM partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -A PROJECT-ID -p p01-arm
```

On the partition, you should reload the list of modules:

```console
ml architecture/aarch64
```

For compilation, `gcc` and `OpenMPI` compilers are available.
Hence, the compilation process should be the same as on the `x64` architecture.

Let's have the following `hello world` example:

```cpp
#include <cstdio>
#include "mpi.h"
#include "omp.h"

int main(int argc, char **argv)
{
        int rank;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        #pragma omp parallel
        {
                printf("Hello on rank %d, thread %d\n", rank, omp_get_thread_num());
        }
        MPI_Finalize();
}
```

You can compile and run the example:

```console
ml OpenMPI/4.1.4-GCC-11.3.0
mpic++ -fopenmp hello.cpp -o hello
mpirun -n 4 ./hello
```

Please see [gcc options](https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html) for more advanced compilation settings.
No complications are expected as long as the application does not use any intrinsics for the `x64` architecture.
If you want to use intrinsics, the
[SVE](https://developer.arm.com/documentation/102699/0100/Optimizing-with-intrinsics) instruction set is available.
# Using NVIDIA Grace Partition

For testing your application on the NVIDIA Grace Partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 144 -A PROJECT-ID -p p11-grace --time=08:00:00
```

where:

- `-N 1` means allocating a single node,
- `-c 144` means allocating 144 cores,
- `-p p11-grace` is the NVIDIA Grace partition,
- `--time=08:00:00` means allocation for 8 hours.

## Available Toolchains

The platform offers three toolchains:

- Standard GCC (as a module `ml GCC`)
- [NVHPC](https://developer.nvidia.com/hpc-sdk) (as a module `ml NVHPC`)
- [Clang for NVIDIA Grace](https://developer.nvidia.com/grace/clang) (installed in `/opt/nvidia/clang`)

!!! note
    The NVHPC toolchain showed strong results with a minimal amount of tuning necessary in our initial evaluation.

### GCC Toolchain

The GCC compiler seems to struggle with the vectorization of short (constant-length) loops, which tend to get completely unrolled/eliminated instead of being vectorized. For example, a simple nested loop such as

```cpp
for(int i = 0; i < 1000000; ++i) {
    // Iterations dependent in "i"
    // ...
    for(int j = 0; j < 8; ++j) {
        // but independent in "j"
        // ...
    }
}
```

may emit scalar code for the inner loop leading to no vectorization being used at all.

### Clang (For Grace) Toolchain

Clang/LLVM tends to behave similarly, but can be guided to properly vectorize the inner loop either with the flags `-O3 -ffast-math -march=native -fno-unroll-loops -mllvm -force-vector-width=8` or with pragmas such as `#pragma clang loop vectorize_width(8)` and `#pragma clang loop unroll(disable)`.

```cpp
for(int i = 0; i < 1000000; ++i) {
    // Iterations dependent in "i"
    // ...
    #pragma clang loop unroll(disable) vectorize_width(8)
    for(int j = 0; j < 8; ++j) {
        // but independent in "j"
        // ...
    }
}
```

!!! note
    Our basic experiments show that fixed-width vectorization (NEON) tends to perform better than SVE in the case of short (register-length) loops. In cases (like the one above) where the specified `vectorize_width` is larger than the available vector unit width, Clang will emit multiple NEON instructions (e.g., 4 instructions will be emitted to process 8 64-bit operations in the 128-bit units of Grace).

### NVHPC Toolchain

The NVHPC toolchain handled the aforementioned case without any additional tuning. A simple `-O3 -march=native -fast` should therefore be sufficient.

## Basic Math Libraries

The basic libraries (BLAS and LAPACK) are included in the NVHPC toolchain and can be used simply with `-lblas` and `-llapack` for BLAS and LAPACK, respectively (`lp64` and `ilp64` versions are also included).

!!! note
    The Grace platform doesn't include a CUDA-capable GPU; therefore, `nvcc` will fail with an error. This means that `nvc`, `nvc++`, and `nvfortran` should be used instead.

### NVIDIA Performance Libraries

The [NVPL](https://developer.nvidia.com/nvpl) package includes a more extensive set of libraries in both sequential and multi-threaded versions:

- BLACS: `-lnvpl_blacs_{lp64,ilp64}_{mpich,openmpi3,openmpi4,openmpi5}`
- BLAS: `-lnvpl_blas_{lp64,ilp64}_{seq,gomp}`
- FFTW: `-lnvpl_fftw`
- LAPACK: `-lnvpl_lapack_{lp64,ilp64}_{seq,gomp}`
- ScaLAPACK: `-lnvpl_scalapack_{lp64,ilp64}`
- RAND: `-lnvpl_rand` or `-lnvpl_rand_mt`
- SPARSE: `-lnvpl_sparse`

This package should be compatible with all available toolchains and includes CMake module files for easy integration into CMake-based projects. For further documentation, see also [NVPL](https://docs.nvidia.com/nvpl).

### Recommended BLAS Library

We recommend to use the multi-threaded BLAS library from the NVPL package.

!!! note
    It is important to pin the processes using **OMP_PROC_BIND=spread**

Example:

```console
$ ml NVHPC
$ nvc -O3 -march=native myprog.c -o myprog -lnvpl_blas_lp64_gomp
$ OMP_PROC_BIND=spread ./myprog
```

## Basic Communication Libraries

The OpenMPI 4 implementation is included with the NVHPC toolchain and is exposed as a module (`ml OpenMPI`). The following example

```cpp
#include <cstdio>
#include <mpi.h>
#include <sched.h>
#include <omp.h>

int main(int argc, char **argv)
{
        int rank;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        #pragma omp parallel
        {
                printf("Hello on rank %d, thread %d on CPU %d\n", rank, omp_get_thread_num(), sched_getcpu());
        }
        MPI_Finalize();
}
```

can be compiled and run as follows

```console
ml OpenMPI
mpic++ -fast -fopenmp hello.cpp -o hello
OMP_PROC_BIND=close OMP_NUM_THREADS=4 mpirun -np 4 --map-by slot:pe=36 ./hello
```

In this configuration, we run 4 ranks, each bound to one quarter of the cores, with 4 OpenMP threads per rank.

## Simple BLAS Application

The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS 3 routine).

Stationary probability vector estimation in `C++`:

```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"

const size_t ITERATIONS  = 32;
const size_t MATRIX_SIZE = 1024;

int main(int argc, char *argv[])
{
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;

    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;

    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);

    std::copy(a.begin(), a.end(), w1.begin());

    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;

    auto c1 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);

        std::swap(t1, t2);
    }

    auto c2 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }

    std::cout << std::endl;

    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    return 0;
}
```

Stationary probability vector estimation in `Fortran`:

```fortran
program main
    implicit none

    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2

    iterations  = 32
    matrix_size = 1024

    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)

    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))

    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5

    w1 = a
    w2(:,:) = 0.0

    t1 => w1
    t2 => w2

    call system_clock(c1)

    do i = 0, iterations
        t2(:,:) = 0.0

        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)

        tmp => t1
        t1  => t2
        t2  => tmp
    end do

    call system_clock(c2)

    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)

    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)

    deallocate(a)
    deallocate(w1)
    deallocate(w2)
end program main
```

### Using NVHPC Toolchain

The C++ version of the example can be compiled with NVHPC and run as follows:

```console
ml NVHPC
nvc++ -O3 -march=native -fast -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lblas main.cpp -o main
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main
```

The Fortran version is just as simple:

```console
ml NVHPC
nvfortran -O3 -march=native -fast -lblas main.f90 -o main.x
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main.x
```

!!! note
    It may be advantageous to use the NVPL libraries instead of the NVHPC ones. For example, the DGEMM BLAS 3 routine from NVPL is almost 30% faster than the NVHPC one.

### Using Clang (For Grace) Toolchain

Similarly, the Clang for Grace toolchain with NVPL BLAS can be used to compile the C++ version of the example.

```console
ml NVHPC
/opt/nvidia/clang/17.23.11/bin/clang++ -O3 -march=native -ffast-math -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lnvpl_blas_lp64_gomp main.cpp -o main
```

!!! note
    The NVHPC module is used just for the `cblas.h` include in this case. This can be avoided by changing the code to use `nvpl_blas.h` instead.

## Additional Resources

- [https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/][1]
- [https://developer.nvidia.com/hpc-sdk][2]
- [https://developer.nvidia.com/grace/clang][3]
- [https://docs.nvidia.com/nvpl][4]

[1]: https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/
[2]: https://developer.nvidia.com/hpc-sdk
[3]: https://developer.nvidia.com/grace/clang
[4]: https://docs.nvidia.com/nvpl
# Heterogeneous Memory Management on Intel Platforms

Partition `p10-intel` offers heterogeneous memory directly exposed to the user. This allows users to manually pick the appropriate kind of memory to be used at process or even single-allocation granularity. Both kinds of memory are exposed as memory-only NUMA nodes, which allows both coarse-grained (process level) and fine-grained (allocation level) control over the memory type used.

## Overview

At the process level, the `numactl` facilities can be utilized, while the Intel-provided `memkind` library allows for finer control. Both the `memkind` library and `numactl` can be accessed by loading the `memkind` module or the `OpenMPI` module (`numactl` only).

```bash
ml memkind
```

### Process Level (NUMACTL)

The `numactl` utility allows you to either restrict the memory pool of the process to a specific set of NUMA memory nodes

```bash
numactl --membind <node_ids_set>
```

or to select a single preferred node

```bash
numactl --preferred <node_id>
```

where `<node_ids_set>` is a comma-separated list (e.g., `0,2,5,...`), possibly in combination with ranges (such as `0-5`). The `membind` option kills the process if it requests more memory than can be satisfied from the specified nodes. The `preferred` option just reverts to using other nodes according to their NUMA distance in the same situation.

A convenient way to check the `numactl` configuration is

```bash
numactl -s
```

which prints the configuration of its execution environment, e.g.:

```bash
numactl --membind 8-15 numactl -s
policy: bind
preferred node: 0
physcpubind: 0 1 2 ... 189 190 191
cpubind: 0 1 2 3 4 5 6 7
nodebind: 0 1 2 3 4 5 6 7
membind: 8 9 10 11 12 13 14 15
```

The last row shows that memory allocations are restricted to NUMA nodes `8-15`.

### Allocation Level (MEMKIND)

The `memkind` library (in its simplest use case) offers a new variant of the `malloc/free` function pair, which allows you to specify the kind of memory to be used for a given allocation. Moving a specific allocation from the default to the HBM memory pool can then be achieved by replacing:

```cpp
void *pData = malloc(<SIZE>);
/* ... */
free(pData);
```

with

```cpp
#include <memkind.h>

void *pData = memkind_malloc(MEMKIND_HBW, <SIZE>);
/* ... */
memkind_free(NULL, pData); // "kind" parameter is deduced from the address
```

Similarly, other kinds of memory can be chosen.

!!! note
    The allocation will return a `NULL` pointer when memory of the specified kind is not available.

## High Bandwidth Memory (HBM)

Intel Sapphire Rapids (partition `p10-intel`) consists of two sockets, each with `128GB` of DDR and `64GB` of on-package HBM memory. The machine is configured in FLAT mode and therefore exposes the HBM memory as memory-only NUMA nodes (`16GB` per 12-core tile). The configuration can be verified by running

```bash
numactl -H
```

which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR DRAM, while `8-15` should have no cores and `16GB` of HBM each).

![](../../img/cs/guides/p10_numa_sc4_flat.png)

### Process Level

With this, we can easily restrict an application to DDR DRAM or HBM memory:

```bash
# Only DDR DRAM
numactl --membind 0-7 ./stream
# ...
Function    Best Rate MB/s  Avg time     Min time     Max time
Copy:          369745.8     0.043355     0.043273     0.043588
Scale:         366989.8     0.043869     0.043598     0.045355
Add:           378054.0     0.063652     0.063483     0.063899
Triad:         377852.5     0.063621     0.063517     0.063884

# Only HBM
numactl --membind 8-15 ./stream
# ...
Function    Best Rate MB/s  Avg time     Min time     Max time
Copy:         1128430.1     0.015214     0.014179     0.015615
Scale:        1045065.2     0.015814     0.015310     0.016309
Add:          1096992.2     0.022619     0.021878     0.024182
Triad:        1065152.4     0.023449     0.022532     0.024559
```

The DDR DRAM achieves a bandwidth of around 400GB/s, while the HBM clears the 1TB/s bar.

Some further improvements can be achieved by entirely isolating a process to a single tile. This can be useful for MPI jobs, where `$OMPI_COMM_WORLD_RANK` can be used to bind each process individually. A simple wrapper script to do this may look like

```bash
#!/bin/bash
numactl --membind $((8 + $OMPI_COMM_WORLD_RANK)) $@
```

and can be used as

```bash
mpirun -np 8 --map-by slot:pe=12 membind_wrapper.sh ./stream_mpi
```

(8 tiles with 12 cores each). However, this approach assumes `16GB` of HBM memory local to the tile is sufficient for each process (memory cannot spill between tiles). This approach may be significantly more useful in combination with `--preferred` instead of `--membind` to force preference of local HBM with spill to DDR DRAM. Otherwise

```bash
mpirun -n 8 --map-by slot:pe=12 numactl --membind 8-15 ./stream_mpi
```

is most likely preferable even for MPI workloads. Applying the above approach to MPI Stream with 8 ranks and 1-24 threads per rank, we can expect these results:
![](../../img/cs/guides/p10_stream_dram.png)
![](../../img/cs/guides/p10_stream_hbm.png)
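
For reference, a sketch of the `--preferred` variant of the wrapper script mentioned above (illustrative only; it assumes Open MPI's `$OMPI_COMM_WORLD_RANK` just like the original wrapper):

```bash
#!/bin/bash
# prefer the HBM node local to this rank's tile, but allow allocations
# to spill over to DDR DRAM once the 16GB of local HBM is exhausted
exec numactl --preferred $((8 + $OMPI_COMM_WORLD_RANK)) "$@"
```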

### Allocation Level

Allocation-level selection of the memory kind using the `memkind` library can be illustrated on a modified stream benchmark. The stream benchmark uses three working arrays (A, B, and C), whose allocation can be changed to `memkind_malloc` as follows:

```cpp
#include <memkind.h>
// ...
STREAM_TYPE *a = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *b = (STREAM_TYPE *)memkind_malloc(MEMKIND_REGULAR, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *c = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
// ...
memkind_free(NULL, a);
memkind_free(NULL, b);
memkind_free(NULL, c);
```

Arrays A and C are allocated from HBM (`MEMKIND_HBW_ALL`), while DDR DRAM (`MEMKIND_REGULAR`) is used for B.
The code then has to be linked against the `memkind` library

```bash
gcc -march=native -O3 -fopenmp -lmemkind memkind_stream.c -o memkind_stream
```

and can be run as

```bash
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
OMP_NUM_THREADS=$((N*12)) OMP_PROC_BIND=spread ./memkind_stream
```

While the `memkind` library should be able to detect HBM memory on its own (through `HMAT` and `hwloc`), this is not supported on `p10-intel`. This means that the NUMA nodes representing HBM have to be specified manually using the `MEMKIND_HBW_NODES` environment variable.

![](../../img/cs/guides/p10_stream_memkind.png)

With this setup, we can see that the simple copy operation (`C[i] = A[i]`) achieves bandwidth comparable to the application bound entirely to HBM memory. On the other hand, the scale operation (`B[i] = s*C[i]`) is mostly limited by the DDR DRAM bandwidth. It is also worth noting that operations combining all three arrays perform close to the HBM-only configuration.

## Simple Application

One application that can greatly benefit from the availability of a large slower memory and a smaller faster memory is computing a histogram with many bins over a large dataset.

```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cstring>
#include <cstdlib> // drand48_r, srand48_r
#include <omp.h>
#include <memkind.h>

const size_t N_DATA_SIZE  = 2 * 1024 * 1024 * 1024ull;
const size_t N_BINS_COUNT = 1 * 1024 * 1024ull;
const size_t N_ITERS      = 10;

#if defined(HBM)
    #define DATA_MEMKIND MEMKIND_REGULAR
    #define BINS_MEMKIND MEMKIND_HBW_ALL
#else
    #define DATA_MEMKIND MEMKIND_REGULAR
    #define BINS_MEMKIND MEMKIND_REGULAR
#endif

int main(int argc, char *argv[])
{
    const double binWidth = 1.0 / double(N_BINS_COUNT + 1);

    double *pData = (double *)memkind_malloc(DATA_MEMKIND, N_DATA_SIZE * sizeof(double));
    size_t *pBins = (size_t *)memkind_malloc(BINS_MEMKIND, N_BINS_COUNT * omp_get_max_threads() * sizeof(size_t));

    #pragma omp parallel
    {
        drand48_data state;
        srand48_r(omp_get_thread_num(), &state);

        #pragma omp for
        for(size_t i = 0; i < N_DATA_SIZE; ++i)
            drand48_r(&state, &pData[i]);
    }

    auto c1 = std::chrono::steady_clock::now();

    for(size_t it = 0; it < N_ITERS; ++it)
    {
        #pragma omp parallel
        {
            for(size_t i = 0; i < N_BINS_COUNT; ++i)
                pBins[omp_get_thread_num()*N_BINS_COUNT + i] = size_t(0);

            #pragma omp for
            for(size_t i = 0; i < N_DATA_SIZE; ++i)
            {
                const size_t idx = size_t(pData[i] / binWidth) % N_BINS_COUNT;
                pBins[omp_get_thread_num()*N_BINS_COUNT + idx]++;
            }
        }
    }

    auto c2 = std::chrono::steady_clock::now();

    #pragma omp parallel for
    for(size_t i = 0; i < N_BINS_COUNT; ++i)
    {
        for(size_t j = 1; j < omp_get_max_threads(); ++j)
            pBins[i] += pBins[j*N_BINS_COUNT + i];
    }

    std::cout << "Elapsed Time [s]: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    size_t total = 0;
    #pragma omp parallel for reduction(+:total)
    for(size_t i = 0; i < N_BINS_COUNT; ++i)
        total += pBins[i];

    std::cout << "Total Items: " << total << std::endl;

    memkind_free(NULL, pData);
    memkind_free(NULL, pBins);

    return 0;
}
```

### Using HBM Memory (P10-Intel)

The following commands can be used to compile and run the example application above:

```bash
ml GCC memkind
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
g++ -O3 -fopenmp -lmemkind histogram.cpp -o histogram_dram
g++ -O3 -fopenmp -lmemkind -DHBM histogram.cpp -o histogram_hbm
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_dram
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_hbm
```

Moving the histogram bins into HBM memory should speed up the algorithm more than twice. It should be noted that also moving the `pData` array into HBM memory worsens this result (presumably because, with the bins in HBM and the data in DDR DRAM, the algorithm can saturate both memory interfaces).

## Additional Resources

- [https://linux.die.net/man/8/numactl][1]
- [http://memkind.github.io/memkind/man_pages/memkind.html][2]
- [https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory][3]

[1]: https://linux.die.net/man/8/numactl
[2]: http://memkind.github.io/memkind/man_pages/memkind.html
[3]: https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory
# Using VMware Horizon

VMware Horizon is a virtual desktop infrastructure (VDI) solution
that enables users to access virtual desktops and applications from any device and any location.
It provides a comprehensive end-to-end solution for managing and delivering virtual desktops and applications,
including features such as session management, user authentication, and virtual desktop provisioning.

![](../../img/horizon.png)

## How to Access VMware Horizon

!!! important
    Access to VMware Horizon requires IT4I VPN.

1. Contact [IT4I support][a] with a request for access and VM allocation.
1. [Download][1] and install the VMware Horizon Client for Windows.
1. Add a new server `https://vdi-cs01.msad.it4i.cz/` in the Horizon client.
1. Connect to the server using your IT4I username and password.
   Username is in the `domain\username` format and the domain is `msad.it4i.cz`.
   For example: `msad.it4i.cz\user123`

## Example

Below is an example of how to mount a remote folder and check the connection on Windows OS:

### Prerequisites

3D applications

* [Blender][3]

SSHFS for remote access

* [sshfs-win][4]
* [winfsp][5]
* [sshfs-win-manager][6]
* SSH keys for access to the clusters

### Steps

1. Start the VPN and connect to the server via VMware Horizon Client.

    ![](../../img/vmware.png)

1. Mount a remote folder.
    * Run sshfs-win-manager.

    ![](../../img/sshfs.png)

    * Add a new connection.

    ![](../../img/sshfs1.png)

    * Click on **Connect**.

    ![](../../img/sshfs2.png)

1. Check that the folder is mounted.

    ![](../../img/mount.png)

1. Check the GPU resources.

    ![](../../img/gpu.png)

### Blender

Now if you run, for example, Blender, you can check the available GPU resources in Blender Preferences.

  ![](../../img/blender.png)

[a]: mailto:support@it4i.cz

[1]: https://vdi-cs01.msad.it4i.cz/
[2]: https://www.paraview.org/download/
[3]: https://www.blender.org/download/
[4]: https://github.com/winfsp/sshfs-win/releases
[5]: https://github.com/winfsp/winfsp/releases/
[6]: https://github.com/evsar3/sshfs-win-manager/releases
# Using IBM Power Partition

For testing your application on the IBM Power partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 192 -A PROJECT-ID -p p07-power --time=08:00:00
```

where:

- `-N 1` means allocating a single node,
- `-c 192` means allocating 192 cores (threads),
- `-p p07-power` is the IBM Power partition,
- `--time=08:00:00` means an allocation for 8 hours.

On the partition, you should reload the list of modules:

```
ml architecture/ppc64le
```

The platform offers both `GNU`-based and proprietary IBM toolchains for building applications. IBM also provides an optimized BLAS routines library ([ESSL](https://www.ibm.com/docs/en/essl/6.1)), which can be used with both toolchains.

## Building Applications

Our sample application depends on `BLAS`, therefore we start by loading the following modules (regardless of which toolchain we want to use):

```
ml GCC OpenBLAS
```

### GCC Toolchain

In the case of the GCC toolchain, we can go ahead and compile the application using either `g++`

```
g++ -lopenblas hello.cpp -o hello
```

or `gfortran`

```
gfortran -lopenblas hello.f90 -o hello
```

as usual.

### IBM Toolchain

The IBM toolchain requires additional environment setup, as it is installed in `/opt/ibm` and is not exposed as a module:

```
IBM_ROOT=/opt/ibm
OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1
OPENXLF_ROOT=$IBM_ROOT/openxlf/17.1.1

export PATH=$OPENXLC_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLC_ROOT/lib:$LD_LIBRARY_PATH

export PATH=$OPENXLF_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH
```

From there, we can use either `ibm-clang++`

```
ibm-clang++ -lopenblas hello.cpp -o hello
```

or `xlf`

```
xlf -lopenblas hello.f90 -o hello
```

to build the application as usual.

!!! note
    The combination of `xlf` and `openblas` seems to cause severe performance degradation. Therefore, the `ESSL` library should be preferred (see below).

### Using ESSL Library

The [ESSL](https://www.ibm.com/docs/en/essl/6.1) library is installed in `/opt/ibm/math/essl/7.1`, so we define additional environment variables:

```
IBM_ROOT=/opt/ibm
ESSL_ROOT=${IBM_ROOT}/math/essl/7.1
export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH
```

The simplest way to utilize `ESSL` in an application that already uses `BLAS` or `CBLAS` routines is to link with the provided `libessl.so`. This can be done by replacing `-lopenblas` with `-lessl`, or with `-lessl -lopenblas` (in case `ESSL` does not provide all of the required `BLAS` routines).
In practice, this can look like

```
g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello
```

or

```
gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello
```

and similarly for IBM compilers (`ibm-clang++` and `xlf`).
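
To double-check which BLAS implementation a binary actually resolves at run time (with `LD_LIBRARY_PATH` set up as above), the shared-library dependencies can be inspected; this is just a quick sanity check, not a required build step:

```
ldd ./hello | grep -i -E 'essl|openblas'
```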

## Hello World Applications

The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation (repeated application of a transition matrix) to illustrate the use of GEMM (a BLAS 3 routine).

Stationary probability vector estimation in `C++`:

```c++
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"

const size_t ITERATIONS  = 32;
const size_t MATRIX_SIZE = 1024;

int main(int argc, char *argv[])
{
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;

    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;

    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);

    std::copy(a.begin(), a.end(), w1.begin());

    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;

    auto c1 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);

        std::swap(t1, t2);
    }

    auto c2 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }

    std::cout << std::endl;

    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    return 0;
}
```

Stationary probability vector estimation in `Fortran`:

```fortran
program main
    implicit none

    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2

    iterations  = 32
    matrix_size = 1024

    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)

    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))

    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5

    w1 = a
    w2(:,:) = 0.0

    t1 => w1
    t2 => w2

    call system_clock(c1)

    do i = 1, iterations
        t2(:,:) = 0.0

        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)

        tmp => t1
        t1  => t2
        t2  => tmp
    end do

    call system_clock(c2)

    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)

    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)

    deallocate(a)
    deallocate(w1)
    deallocate(w2)
end program main
```
# Introduction

Complementary systems offer a development environment for users
who need to port and optimize their code and applications
for various hardware architectures and software technologies
that are not available on standard clusters.

## Complementary Systems 1

The first stage of the complementary systems implementation comprises these partitions:

- compute partition 0 – based on ARM technology – legacy
- compute partition 1 – based on ARM technology – A64FX
- compute partition 2 – based on Intel technologies – Ice Lake, NVDIMMs + Bitware FPGAs
- compute partition 3 – based on AMD technologies – Milan, MI100 GPUs + Xilinx FPGAs
- compute partition 4 – reflecting the Edge type of servers
- partition 5 – FPGA synthesis server

![](../img/cs1_1.png)

## Complementary Systems 2

The second stage of the complementary systems implementation comprises these partitions:

- compute partition 6 - based on ARM technology + CUDA-programmable GPGPU accelerators of the Ampere architecture + DPU network processing units
- compute partition 7 - based on the IBM Power10 architecture
- compute partition 8 - a modern CPU with a very high L3 cache capacity (over 750MB)
- compute partition 9 - virtual GPU-accelerated workstations
- compute partition 10 - Sapphire Rapids-HBM server
- compute partition 11 - NVIDIA Grace CPU Superchip

![](../img/cs2_2.png)

## Modules and Architecture Availability

Complementary systems list available modules automatically based on the detected architecture.

However, you can load one of the three modules -- `aarch64`, `avx2`, and `avx512` --
to reload the list of modules available for the respective architecture:

```console
[user@login.cs ~]$ ml architecture/aarch64

  aarch64 modules + all modules

[user@login.cs ~]$ ml architecture/avx2

  avx2 modules + all modules

[user@login.cs ~]$ ml architecture/avx512

  avx512 modules + all modules
```
# Complementary System Job Scheduling

## Introduction

[Slurm][1] workload manager is used to allocate and access Complementary systems resources.

## Getting Partition Information

Display partitions/queues:

```console
$ sinfo -s
PARTITION AVAIL  TIMELIMIT   NODES(A/I/O/T) NODELIST
p00-arm      up 1-00:00:00          0/1/0/1 p00-arm01
p01-arm*     up 1-00:00:00          0/8/0/8 p01-arm[01-08]
p02-intel    up 1-00:00:00          0/2/0/2 p02-intel[01-02]
p03-amd      up 1-00:00:00          0/2/0/2 p03-amd[01-02]
p04-edge     up 1-00:00:00          0/1/0/1 p04-edge01
p05-synt     up 1-00:00:00          0/1/0/1 p05-synt01
p06-arm      up 1-00:00:00          0/2/0/2 p06-arm[01-02]
p07-power    up 1-00:00:00          0/1/0/1 p07-power01
p08-amd      up 1-00:00:00          0/1/0/1 p08-amd01
p10-intel    up 1-00:00:00          0/1/0/1 p10-intel01
```

## Getting Job Information

Show jobs:

```console
$ squeue --me
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
               104   p01-arm interact    user   R       1:48      2 p01-arm[01-02]
```

Show job details for a specific job:

```console
$ scontrol -d show job JOBID
```

Show job details for the executing job from within the job session:

```console
$ scontrol -d show job $SLURM_JOBID
```

## Running Interactive Jobs

Run an interactive job:

```console
 $ salloc -A PROJECT-ID -p p01-arm
```

Run an interactive job with X11 forwarding:

```console
 $ salloc -A PROJECT-ID -p p01-arm --x11
```

!!! warning
    Do not use `srun` for initiating interactive jobs, subsequent `srun`, `mpirun` invocations would block forever.

## Running Batch Jobs

Run a batch job:

```console
 $ sbatch -A PROJECT-ID -p p01-arm ./script.sh
```
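
The `./script.sh` above is an ordinary shell script with optional `#SBATCH` directives. A minimal sketch might look as follows (illustrative only; resource values are placeholders, e.g. 48 cores matches the p01-arm nodes, and the account/partition are supplied on the `sbatch` command line as shown above):

```
#!/bin/bash
#SBATCH --job-name=example
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --time=02:00:00

./my_application   # placeholder for your program
```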

Useful command options (`salloc`, `sbatch`, `srun`):

* `-n`, `--ntasks`
* `-c`, `--cpus-per-task`
* `-N`, `--nodes`

## Slurm Job Environment Variables

Slurm provides useful information to the job via environment variables. The environment variables are available on all nodes allocated to the job when accessed via Slurm-supported means (`srun`, compatible `mpirun`).

See all Slurm variables:

```
set | grep ^SLURM
```

### Useful Variables

| variable name | description | example |
| ------ | ------ | ------ |
| SLURM_JOB_ID | job id of the executing job| 593 |
| SLURM_JOB_NODELIST | nodes allocated to the job | p03-amd[01-02] |
| SLURM_JOB_NUM_NODES | number of nodes allocated to the job | 2 |
| SLURM_STEP_NODELIST | nodes allocated to the job step | p03-amd01 |
| SLURM_STEP_NUM_NODES | number of nodes allocated to the job step | 1 |
| SLURM_JOB_PARTITION | name of the partition | p03-amd |
| SLURM_SUBMIT_DIR | submit directory | /scratch/project/open-xx-yy/work |

See [Slurm srun documentation][2] for details.

Get the job nodelist:

```
$ echo $SLURM_JOB_NODELIST
p03-amd[01-02]
```

Expand the nodelist to a list of nodes:

```
$ scontrol show hostnames $SLURM_JOB_NODELIST
p03-amd01
p03-amd02
```

## Modifying Jobs

```
$ scontrol update JobId=JOBID ATTR=VALUE
```

for example:

```
$ scontrol update JobId=JOBID Comment='The best job ever'
```

## Deleting Jobs

```
$ scancel JOBID
```

## Partitions

| PARTITION | nodes | whole node | cores per node | features |
| --------- | ----- | ---------- | -------------- | -------- |
| p00-arm   | 1     | yes        | 64             | aarch64,cortex-a72 |
| p01-arm   | 8     | yes        | 48             | aarch64,a64fx,ib |
| p02-intel | 2     | no         | 64             | x86_64,intel,icelake,ib,fpga,bitware,nvdimm |
| p03-amd   | 2     | no         | 64             | x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx |
| p04-edge  | 1     | yes        | 16             | x86_64,intel,broadwell,ib |
| p05-synt  | 1     | yes        | 8              | x86_64,amd,milan,ib,ht |
| p06-arm   | 2     | yes        | 80             | aarch64,ib |
| p07-power | 1     | yes        | 192            | ppc64le,ib |
| p08-amd   | 1     | yes        | 128            | x86_64,amd,milan-x,ib,ht |
| p10-intel | 1     | yes        | 96             | x86_64,intel,sapphire_rapids,ht|

Use the `-t`/`--time` option to specify the job run time limit. The default job time limit is 2 hours, the maximum job time limit is 24 hours.
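
For example, to request a 12-hour time limit (an illustrative value within the 24-hour maximum):

```console
salloc -A PROJECT-ID -p p01-arm -t 12:00:00
```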

FIFO scheduling with backfilling is employed.

## Partition 00 - ARM (Cortex-A72)

Whole node allocation.

One node:

```console
salloc -A PROJECT-ID -p p00-arm
```

## Partition 01 - ARM (A64FX)

Whole node allocation.

One node:

```console
salloc -A PROJECT-ID -p p01-arm
```

```console
salloc -A PROJECT-ID -p p01-arm -N 1
```

Multiple nodes:

```console
salloc -A PROJECT-ID -p p01-arm -N 8
```

## Partition 02 - Intel (Ice Lake, NVDIMMs + Bitware FPGAs)

FPGAs are treated as resources. See below for more details about resources.

Partial allocation - per FPGA, resource separation is not enforced.
Use only FPGAs allocated to the job!

One FPGA:

```console
salloc -A PROJECT-ID -p p02-intel --gres=fpga
```

Two FPGAs on the same node:

```console
salloc -A PROJECT-ID -p p02-intel --gres=fpga:2
```

All FPGAs:

```console
salloc -A PROJECT-ID -p p02-intel -N 2 --gres=fpga:2
```

## Partition 03 - AMD (Milan, MI100 GPUs + Xilinx FPGAs)

GPUs and FPGAs are treated as resources. See below for more details about resources.

Partial allocation - per GPU and per FPGA, resource separation is not enforced.
Use only GPUs and FPGAs allocated to the job!

One GPU:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu
```

Two GPUs on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu:2
```

Four GPUs on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu:4
```

All GPUs:

```console
salloc -A PROJECT-ID -p p03-amd -N 2 --gres=gpu:4
```

One FPGA:

```console
salloc -A PROJECT-ID -p p03-amd --gres=fpga
```

Two FPGAs:

```console
salloc -A PROJECT-ID -p p03-amd --gres=fpga:2
```

All FPGAs:

```console
salloc -A PROJECT-ID -p p03-amd -N 2 --gres=fpga:2
```

One GPU and one FPGA on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu,fpga
```

Four GPUs and two FPGAs on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu:4,fpga:2
```

All GPUs and FPGAs:

```console
salloc -A PROJECT-ID -p p03-amd -N 2 --gres=gpu:4,fpga:2
```

## Partition 04 - Edge Server

Whole node allocation:

```console
salloc -A PROJECT-ID -p p04-edge
```

## Partition 05 - FPGA Synthesis Server

Whole node allocation:

```console
salloc -A PROJECT-ID -p p05-synt
```

## Partition 06 - ARM

Whole node allocation:

```console
salloc -A PROJECT-ID -p p06-arm
```

## Partition 07 - IBM Power

Whole node allocation:

```console
salloc -A PROJECT-ID -p p07-power
```

## Partition 08 - AMD Milan-X

Whole node allocation:

```console
salloc -A PROJECT-ID -p p08-amd
```

## Partition 10 - Intel Sapphire Rapids

Whole node allocation:

```console
salloc -A PROJECT-ID -p p10-intel
```

## Features

Nodes have feature tags assigned to them.
Users can select nodes based on the feature tags using the `--constraint` option.

| Feature | Description |
| ------ | ------ |
| aarch64 | platform |
| x86_64 | platform |
| ppc64le | platform |
| amd | manufacturer |
| intel | manufacturer |
| icelake | processor family |
| broadwell | processor family |
| sapphire_rapids | processor family |
| milan | processor family |
| milan-x | processor family |
| ib | Infiniband |
| gpu | equipped with GPU |
| fpga | equipped with FPGA |
| nvdimm | equipped with NVDIMMs |
| ht | Hyperthreading enabled |
| noht | Hyperthreading disabled |

```
$ sinfo -o '%16N %f'
NODELIST         AVAIL_FEATURES
p00-arm01        aarch64,cortex-a72
p01-arm[01-08]   aarch64,a64fx,ib
p02-intel01      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,ht
p02-intel02      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,noht
p03-amd02        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,noht
p03-amd01        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,ht
p04-edge01       x86_64,intel,broadwell,ib,ht
p05-synt01       x86_64,amd,milan,ib,ht
p06-arm[01-02]   aarch64,ib
p07-power01      ppc64le,ib
p08-amd01        x86_64,amd,milan-x,ib,ht
p10-intel01      x86_64,intel,sapphire_rapids,ht
```

```
$ salloc -A PROJECT-ID -p p02-intel --constraint noht
```

```
$ scontrol -d show node p02-intel02 | grep ActiveFeatures
   ActiveFeatures=x86_64,intel,icelake,ib,fpga,bitware,nvdimm,noht
```

## Resources, GRES

Slurm supports the ability to define and schedule arbitrary resources - Generic RESources (GRES) in Slurm's terminology. We use GRES for scheduling/allocating GPUs and FPGAs.

!!! warning
    Use only allocated GPUs and FPGAs. Resource separation is not enforced. If you use non-allocated resources, you may observe strange behavior and get into trouble.

### Node Resources

Get information about GRES on a node:

```
$ scontrol -d show node p02-intel01 | grep Gres=
   Gres=fpga:bitware_520n_mx:2
$ scontrol -d show node p02-intel02 | grep Gres=
   Gres=fpga:bitware_520n_mx:2
$ scontrol -d show node p03-amd01 | grep Gres=
   Gres=gpu:amd_mi100:4,fpga:xilinx_alveo_u250:2
$ scontrol -d show node p03-amd02 | grep Gres=
   Gres=gpu:amd_mi100:4,fpga:xilinx_alveo_u280:2
```

### Request Resources

To allocate the required resources (GPUs or FPGAs), use the `--gres` option of `salloc`/`srun`.

Example: Allocate one FPGA

```
$ salloc -A PROJECT-ID -p p03-amd --gres fpga:1
```

### Find Out Allocated Resources

Information about the allocated resources is available in the Slurm job details, in the `JOB_GRES` and `GRES` attributes.

```
$ scontrol -d show job $SLURM_JOBID |grep GRES=
   JOB_GRES=fpga:xilinx_alveo_u250:1
     Nodes=p03-amd01 CPU_IDs=0-1 Mem=0 GRES=fpga:xilinx_alveo_u250:1(IDX:0)
```

The IDX in the GRES attribute specifies the index(es) of the FPGA(s) (or GPUs) allocated to the job on the node. In the given example, the allocated resource is `fpga:xilinx_alveo_u250:1(IDX:0)`, so we should use the FPGA with index/number 0 on node p03-amd01.

### Request Specific Resources

It is possible to allocate specific resources. This is useful for the p03-amd partition, which is equipped with FPGAs of different types.

A GRES entry uses the format `name[[:type]:count]`; in the following example, the name is `fpga`, the type is `xilinx_alveo_u280`, and the count is 2.

```
$ salloc -A PROJECT-ID -p p03-amd --gres=fpga:xilinx_alveo_u280:2
salloc: Granted job allocation XXX
salloc: Waiting for resource configuration
salloc: Nodes p03-amd02 are ready for job

$ scontrol -d show job $SLURM_JOBID | grep -i gres
   JOB_GRES=fpga:xilinx_alveo_u280:2
     Nodes=p03-amd02 CPU_IDs=0 Mem=0 GRES=fpga:xilinx_alveo_u280(IDX:0-1)
   TresPerNode=gres:fpga:xilinx_alveo_u280:2
```

[1]: https://slurm.schedmd.com/
[2]: https://slurm.schedmd.com/srun.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES
# Accessing the DGX-2

## Before You Access

!!! warning
    GPUs are single-user devices. GPU memory is not purged between job runs and it can be read (but not written) by any user. Consider the confidentiality of your running jobs.

## How to Access

The DGX-2 machine is integrated into the [Barbora cluster][3].
It can be accessed from the Barbora login nodes `barbora.it4i.cz` through the Barbora scheduler queue `qdgx`, as the compute node `cn202`.

## Storage

There are three shared file systems on the DGX-2 system: HOME, SCRATCH (LSCRATCH), and PROJECT.

### HOME

The HOME filesystem is realized as an NFS filesystem. This is a shared home from the [Barbora cluster][1].

### SCRATCH

The SCRATCH is realized on NVMe storage. The SCRATCH filesystem is mounted in the `/scratch` directory.
Accessible capacity is 22TB, shared among all users.

!!! warning
    Files on the SCRATCH filesystem that are not accessed for more than 60 days will be automatically deleted.

### PROJECT

The PROJECT data storage is IT4Innovations' central data storage accessible from all clusters.
For more information on accessing PROJECT, its quotas, etc., see the [PROJECT Data Storage][2] section.

[1]: ../../barbora/storage/#home-file-system
[2]: ../../storage/project-storage
[3]: ../../barbora/introduction
# Introduction

The NVIDIA DGX-2 is a very powerful computational node, featuring high-end x86_64 processors and 16 NVIDIA V100-SXM3 GPUs.

| NVIDIA DGX-2  | |
| --- | --- |
| CPUs | 2 x Intel Xeon Platinum |
| GPUs | 16 x NVIDIA Tesla V100 32GB HBM2 |
| System Memory | Up to 1.5 TB DDR4 |
| GPU Memory | 512 GB HBM2 (16 x 32 GB)	|
| Storage | 30 TB NVMe, Up to 60 TB |
| Networking | 8 x Infiniband or 8 x 100 GbE |
| Power | 10 kW	|
| Size | 350 lbs |
| GPU Throughput | Tensor: 1920 TFLOPs, FP16: 520 TFLOPs, FP32: 260 TFLOPs, FP64: 130 TFLOPs |

The [DGX-2][a] introduces NVIDIA’s new NVSwitch, enabling 300 GB/s chip-to-chip communication at 12 times the speed of PCIe.

With NVLink2, it enables 16x NVIDIA V100-SXM3 GPUs in a single system, for a total bandwidth going beyond 14 TB/s.
Featuring a pair of Xeon 8168 CPUs, 1.5 TB of memory, and 30 TB of NVMe storage,
we get a system that consumes 10 kW, weighs 163.29 kg, but offers double-precision performance in excess of 130 TF.

The DGX-2 is designed to be a powerful server in its own right.
On the storage side, the DGX-2 comes with 30TB of NVMe-based solid state storage.
For clustering or further inter-system communications, it also offers InfiniBand and 100GigE connectivity, up to eight of them.

Further, the [DGX-2][b] offers a total of ~2 PFLOPs of half-precision performance in a single system when using the tensor cores.

![](../img/dgx1.png)

With the DGX-2, training AlexNet, the network that 'started' the latest machine learning revolution, now takes 18 minutes.

The DGX-2 is able to complete the training process
for FAIRSEQ – a neural network model for language translation – 10x faster than a DGX-1 system,
bringing it down to less than two days total rather than 15 days.

The new NVSwitches mean that the PCIe lanes of the CPUs can be redirected elsewhere, most notably towards storage and networking connectivity.
The topology of the DGX-2 means that all 16 GPUs are able to pool their memory into a unified memory space,
though with the usual tradeoffs involved if going off-chip.

![](../img/dgx2-nvlink.png)

[a]: https://www.nvidia.com/content/dam/en-zz/es_em/Solutions/Data-Center/dgx-2/nvidia-dgx-2-datasheet.pdf
[b]: https://www.youtube.com/embed/OTOGw0BRqK0

# Migration to e-INFRA CZ

## Introduction

IT4Innovations is a part of [e-INFRA CZ][1] - the strategic research infrastructure of the Czech Republic, which provides capacities and resources for the transmission, storage, and processing of scientific and research data. In January 2022, IT4I began the process of integrating its services.

As a part of the process, a joint e-INFRA CZ user base has been established. This included a migration of eligible IT4I accounts.

## Who Has Been Affected

The migration affected all accounts of users affiliated with an academic organization in the Czech Republic who also have an OPEN-XX-XX project. Affected users have received an email with information about changes in personal data processing.

## Who Has Not Been Affected

Commercial users, training accounts, suppliers, and service accounts were **not** affected by the migration.

## Process

During the process, additional steps have been required for successful migration.

This may have included:

1. e-INFRA CZ registration, if one does not already exist.
2. e-INFRA CZ password reset, if one does not already exist.

## Steps After Migration

After the migration, you must use your **e-INFRA CZ credentials** to access all IT4I services as well as [e-INFRA CZ services][5].

Successfully migrated accounts tied to e-INFRA CZ can be self-managed at [e-INFRA CZ User profile][4].

!!! tip "Recommendation"
    We recommend [verifying your SSH keys][6] for cluster access.

## Troubleshooting

If you have a problem with an account migrated to the e-INFRA CZ user base, contact [CESNET support][7].

If you have questions or a problem with an IT4I account (i.e. an account not eligible for migration), contact [IT4I support][2].

[1]: https://www.e-infra.cz/en
[2]: mailto:support@it4i.cz
[3]: https://www.cesnet.cz/?lang=en
[4]: https://profile.e-infra.cz/
[5]: https://www.e-infra.cz/en/services
[6]: https://profile.e-infra.cz/profile/settings/sshKeys
[7]: mailto:support@cesnet.cz

Additional files added in this revision (previews collapsed due to the size limit): docs.it4i/dice.md, docs.it4i/index.md, docs.it4i/prace.md, docs.it4i/robots.txt, docs.it4i/src/css.css, material/404.html, mkdocs.yml, package.json, pathcheck.sh, requirements.txt, scripts/add_version.sh, scripts/clean_json.sh, scripts/colors.sh, scripts/get_cvs.sh, scripts/get_modules.sh, scripts/maketitle.py, scripts/matrix.py, scripts/meta-json.sh, scripts/movefiles.sh, scripts/movepublic.sh, scripts/titlemd.py, scripts/url_test.py, snippets/mathjax.md, and several further files.