Files

.gitignore

0 → 100644
+3 −0
site/
scripts/*.csv
venv/

.gitlab-ci.yml

0 → 100644
+137 −0
stages:
  - test
  - build
  - deploy
  - after_test

variables:
    PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"

docs:
  stage: test
  image: it4innovations/docker-mdcheck:latest
  allow_failure: true
  script:
  - find content/docs -name "*.mdx" | xargs mdl -r ~MD002,~MD007,~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046

pylint:
  stage: test
  image: it4innovations/docker-pycheck:latest
  before_script:
  - source /opt/.venv3/bin/activate
  script:
  - pylint $(find . -name "*.py" -not -name "feslicescript.py")

capitalize:
  stage: test
  image: it4innovations/docker-mkdocscheck:latest
  allow_failure: true
  before_script:
  - source /opt/.venv3/bin/activate
  - python -V # debug
  - pip list | grep titlecase 
  script:
  - find content/docs/ \( -name '*.mdx' -o -name '*.yml' \) ! -path '*einfracz*' -print0 | xargs -0 -n1 scripts/titlemd.py --test

ext_links:
  stage: after_test
  image: it4innovations/docker-mdcheck:latest
  allow_failure: true
  after_script:
  # remove JSON results
  - rm *.json
  script:
  - find content/docs -name '*.mdx' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
  only:
  - master

404s:
  stage: after_test
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  - echo "192.168.101.10 docs.it4i.cz" >> /etc/hosts
  - wget -V
  - echo https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
  - wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/ || true
  script:
  - cat wget.log | awk '/^Found [0-9]+ broken link[s]?.$/,/FINISHED/ { rc=-1; print $0 }; END { exit rc }'

mkdocs:
  stage: build
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  - source /opt/.venv3/bin/activate
  - python -V # debug
  - pip install -r requirements.txt
  - pip freeze # debug
  - mkdocs -V # debug
  script:
    # add version to footer
  - bash scripts/add_version.sh
    # get modules list from clusters
  - bash scripts/get_modules.sh
    # generate site_url
  - (if [ "${CI_COMMIT_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_COMMIT_REF_NAME\//" mkdocs.yml;fi);
    # generate URL for code link
#  - sed -i "s/master/$CI_BUILD_REF_NAME/g" material/partials/toc.html
    # regenerate modules matrix
  - python scripts/modules_matrix.py > docs.it4i/modules-matrix.md
  - python scripts/modules_matrix.py --json > docs.it4i/modules-matrix.json
  - curl -f0 https://code.it4i.cz/sccs/scs-api-public/raw/master/scs_api.server_public.md -o docs.it4i/apiv1.md
    # build pages
  - mkdocs build
    # replace broken links in 404.html
  - sed -i 's,href="" title=",href="/" title=",g' site/404.html
  - cp site/404.html site/403.html
  - sed -i 's/404 - Not found/403 - Forbidden/g' site/403.html
    # compress sitemap
  - gzip < site/sitemap.xml > site/sitemap.xml.gz
  artifacts:
    paths:
    - site
    expire_in: 1 week

deploy to stage:
  environment: stage
  stage: deploy
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  # install ssh-agent
  - 'which ssh-agent || ( apt-get update -y && apt-get install openssh-client -y )'
  - 'which rsync || ( apt-get update -y && apt-get install rsync -y )'
  # run ssh-agent
  - eval $(ssh-agent -s)
  # add ssh key stored in SSH_PRIVATE_KEY variable to the agent store
  - ssh-add <(echo "$SSH_PRIVATE_KEY")
  # disable host key checking (NOTE: makes you susceptible to man-in-the-middle attacks)
  # WARNING: use only in docker container, if you use it with shell you will overwrite your user's ssh config
  - mkdir -p ~/.ssh
  - echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
  script:
  - chown nginx:nginx site -R
  - rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
  only:
  - branches@sccs/docs.it4i.cz

deploy to production:
  environment: production
  stage: deploy
  image: it4innovations/docker-mkdocscheck:latest
  before_script:
  # install ssh-agent
  - 'which ssh-agent || ( apt-get update -y && apt-get install openssh-client -y )'
  - 'which rsync || ( apt-get update -y && apt-get install rsync -y )'
  # run ssh-agent
  - eval $(ssh-agent -s)
  # add ssh key stored in SSH_PRIVATE_KEY variable to the agent store
  - ssh-add <(echo "$SSH_PRIVATE_KEY")
  # disable host key checking (NOTE: makes you susceptible to man-in-the-middle attacks)
  # WARNING: use only in docker container, if you use it with shell you will overwrite your user's ssh config
  - mkdir -p ~/.ssh
  - echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
  script:
  - chown nginx:nginx site -R
  - rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/site/
  only:
  - master@sccs/docs.it4i.cz
  when: manual

.spelling

0 → 100644
+851 −0
Quantum Scalar I6
JAN
LUMI
AI
CI/CD
AWS
CLI
FAQ
s3cmd
GUI
EESSI
hipBlas
hipSolver
LUMI
apptainer
ROCm
HIP
NVIDIA DGX-2
nvidia
smi
nvidia-smi
NICE
DGX-2
DGX
DCV
In
CAE
CUBE
GPU
GSL
LMGC90
LS-DYNA
MAPDL
GPI-2
COM
.ssh
Anselm
IT4I
IT4Innovations
PBS
vnode
vnodes
Salomon
TurboVNC
VNC
DDR3
DIMM
InfiniBand
CUDA
ORCA
COMSOL
API
GNU
CUDA
NVIDIA
LiveLink
MATLAB
Allinea
LLNL
Vampir
Doxygen
VTune
TotalView
Valgrind
ParaView
OpenFOAM
MAX_FAIRSHARE
MPI4Py
MPICH2
PETSc
Trilinos
FFTW
HDF5
BiERapp
AVX
AVX2
JRE
JDK
QEMU
VMware
VirtualBox
NUMA
SMP
BLAS
LAPACK
FFTW3
Dongarra
OpenCL
cuBLAS
CESNET
Jihlava
NVIDIA
Xeon
ANSYS
CentOS
RHEL
DDR4
DIMMs
GDDR5
EasyBuild
e.g.
MPICH
MVAPICH2
OpenBLAS
ScaLAPACK
PAPI
SGI
UV2000
VM
400GB
Mellanox
RedHat
ssh.du1.cesnet.cz
ssh.du2.cesnet.cz
ssh.du3.cesnet.cz
DECI
supercomputing
AnyConnect
X11
backfilling
backfilled
SCP
Lustre
QDR
TFLOP
ncpus
myjob
pernode
mpiprocs
ompthreads
qprace
runtime
SVS
ppn
Multiphysics
aeroacoustics
turbomachinery
CFD
LS-DYNA
APDL
MAPDL
multiphysics
AUTODYN
RSM
Molpro
initio
parallelization
NWChem
SCF
ISV
profiler
Pthreads
profilers
OTF
PAPI
PCM
uncore
pre-processing
prepend
CXX
prepended
POMP2
Memcheck
unaddressable
OTF2
GPI-2
GASPI
GPI
MKL
IPP
TBB
GSL
Omics
VNC
Scalasca
IFORT
interprocedural
IDB
cloop
qcow
qcow2
vmdk
vdi
virtio
paravirtualized
Gbit
tap0
UDP
TCP
preload
qfat
Rmpi
DCT
datasets
dataset
preconditioners
partitioners
PARDISO
PaStiX
SuiteSparse
SuperLU
ExodusII
NetCDF
ParMETIS
multigrid
HYPRE
SPAI
Epetra
EpetraExt
Tpetra
64-bit
Belos
GMRES
Amesos
IFPACK
preconditioner
Teuchos
Makefiles
SAXPY
NVCC
VCF
HGMD
HUMSAVAR
ClinVar
indels
CIBERER
exomes
tmp
SSHFS
RSYNC
unmount
Cygwin
CygwinX
RFB
TightVNC
TigerVNC
GUIs
XLaunch
UTF-8
numpad
PuTTYgen
OpenSSH
IE11
x86
r21u01n577
7120P
interprocessor
IPN
toolchains
toolchain
APIs
easyblocks
GM200
GeForce
GTX
IRUs
ASIC
backplane
ICEX
IRU
PFLOP
T950B
ifconfig
inet
addr
checkbox
appfile
programmatically
http
https
filesystem
phono3py
HDF
splitted
automize
llvm
PGI
GUPC
BUPC
IBV
Aislinn
nondeterminism
stdout
stderr
i.e.
pthreads
uninitialised
broadcasted
ITAC
hotspots
Bioinformatics
semiempirical
DFT
polyfill
ES6
HTML5Rocks
minifiers
CommonJS
PhantomJS
bundlers
Browserify
versioning
isflowing
ispaused
NPM
sublicense
Streams2
Streams3
blogpost
GPG
mississippi
Uint8Arrays
Uint8Array
endianness
styleguide
noop
MkDocs
 - docs.it4i/anselm-cluster-documentation/environment-and-modules.md
MODULEPATH
bashrc
PrgEnv-gnu
bullx
MPI
PrgEnv-intel
EasyBuild
 - docs.it4i/anselm-cluster-documentation/capacity-computing.md
capacity.zip
README
 - docs.it4i/anselm-cluster-documentation/compute-nodes.md
DIMMs
 - docs.it4i/anselm-cluster-documentation/hardware-overview.md
cn
K20
Xeon
x86-64
Virtualization
virtualization
NVIDIA
5110P
SSD
lscratch
login1
login2
dm1
Rpeak
LINPACK
Rmax
E5-2665
E5-2470
P5110
isw
 - docs.it4i/anselm-cluster-documentation/introduction.md
RedHat
 - docs.it4i/anselm-cluster-documentation/job-priority.md
walltime
qexp
_List.fairshare
_time
_FAIRSHARE
1E6
 - docs.it4i/anselm-cluster-documentation/job-submission-and-execution.md
15209.srv11
qsub
15210.srv11
pwd
cn17.bullx
cn108.bullx
cn109.bullx
cn110.bullx
pdsh
hostname
SCRDIR
mkdir
mpiexec
qprod
Jobscript
jobscript
cn108
cn109
cn110
Name0
cn17
_NODEFILE
_O
_WORKDIR
mympiprog.x
_JOBID
myprog.x
openmpi
 - docs.it4i/anselm-cluster-documentation/network.md
ib0
 - docs.it4i/anselm-cluster-documentation/prace.md
PRACE
qfree
it4ifree
it4i.portal.clients
prace
1h
 - docs.it4i/anselm-cluster-documentation/shell-and-data-access.md
VPN
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-cfx.md
ANSYS
CFX
cfx.pbs
_r
ane3fl
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-mechanical-apdl.md
mapdl.pbs
_dy
 - docs.it4i/anselm-cluster-documentation/software/ansys/ls-dyna.md
HPC
lsdyna.pbs
 - docs.it4i/anselm-cluster-documentation/software/chemistry/molpro.md
OpenMP
 - docs.it4i/anselm-cluster-documentation/software/compilers.md
Fortran
 - docs.it4i/anselm-cluster-documentation/software/debuggers/intel-performance-counter-monitor.md
E5-2600
 - docs.it4i/anselm-cluster-documentation/software/debuggers/score-p.md
Makefile
 - docs.it4i/anselm-cluster-documentation/software/gpi2.md
gcc
cn79
helloworld
_gpi.c
ibverbs
gaspi
_logger
 - docs.it4i/anselm-cluster-documentation/software/intel-suite/intel-compilers.md
Haswell
CPUs
ipo
O3
vec
xAVX
omp
simd
ivdep
pragmas
openmp
xCORE-AVX2
axCORE-AVX2
 - docs.it4i/anselm-cluster-documentation/software/kvirtualization.md
rc.local
runlevel
RDP
DHCP
DNS
SMB
VDE
smb.conf
TMPDIR
run.bat.
slirp
NATs
 - docs.it4i/anselm-cluster-documentation/software/mpi/mpi4py-mpi-for-python.md
NumPy
 - docs.it4i/anselm-cluster-documentation/software/numerical-languages/matlab_1314.md
mpiLibConf.m
matlabcode.m
output.out
matlabcodefile
sched
_feature
 - docs.it4i/anselm-cluster-documentation/software/numerical-languages/matlab.md
UV2000
maxNumCompThreads
SalomonPBSPro
 - docs.it4i/anselm-cluster-documentation/software/numerical-languages/octave.md
_THREADS
_NUM
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/trilinos.md
CMake-aware
Makefile.export
_PACKAGE
_CXX
_COMPILER
_INCLUDE
_DIRS
_LIBRARY
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-ls-dyna.md
ansysdyna.pbs
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys.md
svsfem.cz
_
 - docs.it4i/anselm-cluster-documentation/software/debuggers/valgrind.md
libmpiwrap-amd64-linux
O0
valgrind
malloc
_PRELOAD
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/magma-for-intel-xeon-phi.md
cn204
_LIBS
MAGMAROOT
_magma
_server
_anselm
_from
_mic.sh
_dgetrf
_mic
_03.pdf
 - docs.it4i/anselm-cluster-documentation/software/paraview.md
cn77
localhost
v4.0.1
 - docs.it4i/anselm-cluster-documentation/storage.md
ssh.du1.cesnet.cz
Plzen
ssh.du2.cesnet.cz
ssh.du3.cesnet.cz
tier1
_home
_cache
_tape
 - docs.it4i/salomon/environment-and-modules.md
icc
ictce
ifort
imkl
intel
gompi
goolf
BLACS
iompi
iccifort
 - docs.it4i/salomon/hardware-overview.md
HW
E5-4627v2
 - docs.it4i/salomon/job-submission-and-execution.md
15209.isrv5
r21u01n577
r21u02n578
r21u03n579
r21u04n580
qsub
15210.isrv5
pwd
r2i5n6.ib0.smc.salomon.it4i.cz
r4i6n13.ib0.smc.salomon.it4i.cz
r4i7n2.ib0.smc.salomon.it4i.cz
pdsh
r2i5n6
r4i6n13
r4i7n
r4i7n2
r4i7n0
SCRDIR
myjob
mkdir
mympiprog.x
mpiexec
myprog.x
r4i7n0.ib0.smc.salomon.it4i.cz
 - docs.it4i/salomon/7d-enhanced-hypercube.md
cns1
cns576
r1i0n0
r4i7n17
cns577
cns1008
r37u31n1008
7D
 - docs.it4i/anselm-cluster-documentation/resources-allocation-policy.md
qsub
it4ifree
it4i.portal.clients
x86
x64
 - docs.it4i/anselm-cluster-documentation/software/ansys/ansys-fluent.md
anslic
_admin
 - docs.it4i/anselm-cluster-documentation/software/chemistry/nwchem.md
_DIR
 - docs.it4i/anselm-cluster-documentation/software/comsol-multiphysics.md
EDU
comsol
_matlab.pbs
_job.m
mphstart
 - docs.it4i/anselm-cluster-documentation/software/debuggers/allinea-performance-reports.md
perf-report
perf
txt
html
mympiprog
_32p
 - docs.it4i/anselm-cluster-documentation/software/debuggers/intel-vtune-amplifier.md
Hotspots
 - docs.it4i/anselm-cluster-documentation/software/debuggers/scalasca.md
scorep
 - docs.it4i/anselm-cluster-documentation/software/isv_licenses.md
edu
ansys
_features
_state.txt
f1
matlab
acfd
_ansys
_acfd
_aa
_comsol
HEATTRANSFER
_HEATTRANSFER
COMSOLBATCH
_COMSOLBATCH
STRUCTURALMECHANICS
_STRUCTURALMECHANICS
_matlab
_Toolbox
_Image
_Distrib
_Comp
_Engine
_Acquisition
pmode
matlabpool
 - docs.it4i/anselm-cluster-documentation/software/mpi/mpi.md
mpirun
BLAS1
FFT
KMP
_AFFINITY
GOMP
_CPU
bullxmpi-1
mpich2
 - docs.it4i/anselm-cluster-documentation/software/mpi/Running_OpenMPI.md
bysocket
bycore
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/fftw.md
gcc3.3.3
pthread
fftw3
lfftw3
_threads-lfftw3
_omp
icc3.3.3
FFTW2
gcc2.1.5
fftw2
lfftw
_threads
icc2.1.5
fftw-mpi3
_mpi
fftw3-mpi
fftw2-mpi
IntelMPI
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/gsl.md
dwt.c
mkl
lgsl
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/hdf5.md
icc
hdf5
_INC
_SHLIB
_CPP
_LIB
_F90
gcc49
 - docs.it4i/anselm-cluster-documentation/software/numerical-libraries/petsc.md
_Dist
 - docs.it4i/anselm-cluster-documentation/software/nvidia-cuda.md
lcublas
 - docs.it4i/anselm-cluster-documentation/software/operating-system.md
6.x
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/graphical-user-interface/cygwin-and-x11-forwarding.md
startxwin
cygwin64binXWin.exe
tcp
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/graphical-user-interface/x-window-system.md
Xming
XWin.exe.
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/shell-access-and-data-transfer/pageant.md
_rsa.ppk
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/shell-access-and-data-transfer/puttygen.md
_keys
organization.example.com
_rsa
 - docs.it4i/get-started-with-it4innovations/accessing-the-clusters/shell-access-and-data-transfer/vpn-connection-fail-in-win-8.1.md
vpnui.exe
 - docs.it4i/salomon/ib-single-plane-topology.md
36-port
Mcell.pdf
r21-r38
nodes.pdf
 - docs.it4i/salomon/introduction.md
E5-2680v3
 - docs.it4i/salomon/network.md
r4i1n0
r4i1n1
r4i1n2
r4i1n3
ip
 - docs.it4i/salomon/software/ansys/setting-license-preferences.md
ansys161
 - docs.it4i/salomon/software/ansys/workbench.md
mpifile.txt
solvehandlers.xml
 - docs.it4i/salomon/software/chemistry/phono3py.md
vasprun.xml
disp-XXXXX
disp
_fc3.yaml
ir
_grid
_points.yaml
gofree-cond1
 - docs.it4i/salomon/software/compilers.md
HPF
 - docs.it4i/salomon/software/comsol/licensing-and-available-versions.md
ver
 - docs.it4i/salomon/software/debuggers/aislinn.md
test.cpp
 - docs.it4i/salomon/software/debuggers/intel-vtune-amplifier.md
vtune
_update1
 - docs.it4i/salomon/software/debuggers/valgrind.md
EBROOTVALGRIND
 - docs.it4i/salomon/software/intel-suite/intel-advisor.md
O2
 - docs.it4i/salomon/software/intel-suite/intel-compilers.md
UV1
 - docs.it4i/salomon/software/numerical-languages/octave.md
octcode.m
mkoctfile
 - docs.it4i/software/orca.md
pdf
 - node_modules/es6-promise/README.md
rsvp.js
es6-promise
es6-promise-min
Node.js
testem
 - node_modules/spawn-sync/lib/json-buffer/README.md
node.js
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/doc/wg-meetings/2015-01-30.md
WG
domenic
mikeal
io.js
sam
calvin
whatwg
compat
mathias
isaac
chris
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/core-util-is/README.md
core-util-is
v0.12.
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/isarray/README.md
isarray
Gruber
julian
juliangruber.com
NONINFRINGEMENT
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/process-nextick-args/license.md
Metcalf
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/process-nextick-args/readme.md
process-nextick-args
process.nextTick
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/string_decoder/README.md
_decoder.js
Joyent
joyent
repo
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/util-deprecate/History.md
kumavis
jsdocs
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/node_modules/util-deprecate/README.md
util-deprecate
Rajlich
 - node_modules/spawn-sync/node_modules/concat-stream/node_modules/readable-stream/README.md
v7.0.0
userland
chrisdickinson
christopher.s.dickinson
gmail.com
9554F04D7259F04124DE6B476D5A82AC7E37093B
calvinmetcalf
calvin.metcalf
F3EF5F62A87FC27A22E643F714CE4FF5015AA242
Vagg
rvagg
vagg.org
DD8F2338BAE7501E3DD5AC78C273792F7D83545D
sonewman
newmansam
outlook.com
Buus
mafintosh
mathiasbuus
Denicola
domenic.me
Matteo
Collina
mcollina
matteo.collina
3ABC01543F22DD2239285CDD818674489FBC127E
 - node_modules/spawn-sync/node_modules/concat-stream/readme.md
concat-stream
concat
cb
 - node_modules/spawn-sync/node_modules/os-shim/README.md
0.10.x
os.tmpdir
os.endianness
os.EOL
os.platform
os.arch
0.4.x
Aparicio
Adesis
Netlife
S.L
 - node_modules/spawn-sync/node_modules/try-thread-sleep/node_modules/thread-sleep/README.md
node-pre-gyp
npm
 - node_modules/spawn-sync/README.md
iojs
UCX
Dask-ssh
SCRATCH
HOME
PROJECT
e-INFRA
e-INFRA CZ
DICE
qgpu
qcpu
it4i-portal-clients
it4icheckaccess
it4idedicatedtime
it4ifree
it4ifsusage
it4iuserfsusage
it4iprojectfsusage
it4imotd
e-INFRA
it4i-portal-clients
s3cmd
s5cmd
title:
e-INFRA CZ Cloud Ostrava
e-INFRA CZ Account

README.md

0 → 100644
+8 −0
# IT4Innovations Documentation

This project contains IT4Innovations user documentation source.

## Migration

* [fumadocs](https://fumadocs.vercel.app/)
 No newline at end of file
+132 −0
# Compute Nodes

## Node Configuration

Anselm is a cluster of x86-64 Intel-based nodes built with the Bull Extreme Computing bullx technology. The cluster contains four types of compute nodes.

### Compute Nodes Without Accelerators

* 180 nodes
* 2880 cores in total
* two Intel Sandy Bridge E5-2665, 8-core, 2.4GHz processors per node
* 64 GB of physical memory per node
* one 500 GB SATA 2.5” 7.2 krpm HDD per node
* bullx B510 blade servers
* cn[1-180]

### Compute Nodes With a GPU Accelerator

* 23 nodes
* 368 cores in total
* two Intel Sandy Bridge E5-2470, 8-core, 2.3GHz processors per node
* 96 GB of physical memory per node
* one 500 GB SATA 2.5” 7.2 krpm HDD per node
* GPU accelerator 1x NVIDIA Tesla Kepler K20m per node
* bullx B515 blade servers
* cn[181-203]

### Compute Nodes With a MIC Accelerator

* 4 nodes
* 64 cores in total
* two Intel Sandy Bridge E5-2470, 8-core, 2.3GHz processors per node
* 96 GB of physical memory per node
* one 500 GB SATA 2.5” 7.2 krpm HDD per node
* MIC accelerator 1x Intel Xeon Phi 5110P per node
* bullx B515 blade servers
* cn[204-207]

### Fat Compute Nodes

* 2 nodes
* 32 cores in total
* 2 Intel Sandy Bridge E5-2665, 8-core, 2.4GHz processors per node
* 512 GB of physical memory per node
* two 300 GB SAS 3.5” 15 krpm HDDs (RAID1) per node
* two 100 GB SLC SSDs per node
* bullx R423-E3 servers
* cn[208-209]

![](../img/bullxB510.png)
**Anselm bullx B510 servers**

### Compute Node Summary

| Node type                    | Count | Range       | Memory | Cores       | Queues                                    |
| ---------------------------- | ----- | ----------- | ------ | ----------- | --------------------------------------    |
| Nodes without an accelerator | 180   | cn[1-180]   | 64GB   | 16 @ 2.4GHz | qexp, qprod, qlong, qfree, qprace, qatlas |
| Nodes with a GPU accelerator | 23    | cn[181-203] | 96GB   | 16 @ 2.3GHz | qnvidia, qexp                             |
| Nodes with a MIC accelerator | 4     | cn[204-207] | 96GB   | 16 @ 2.3GHz | qmic, qexp                                |
| Fat compute nodes            | 2     | cn[208-209] | 512GB  | 16 @ 2.4GHz | qfat, qexp                                |

## Processor Architecture

Anselm is equipped with Intel Sandy Bridge processors Intel Xeon E5-2665 (nodes without accelerators and fat nodes) and Intel Xeon E5-2470 (nodes with accelerators). The processors support Advanced Vector Extensions (AVX) 256-bit instruction set.

### Intel Sandy Bridge E5-2665 Processor

* eight-core
* speed: 2.4 GHz, up to 3.1 GHz using Turbo Boost Technology
* peak performance:  19.2 GFLOP/s per core
* caches:
  * L2: 256 KB per core
  * L3: 20 MB per processor
* memory bandwidth at the level of the processor: 51.2 GB/s

### Intel Sandy Bridge E5-2470 Processor

* eight-core
* speed: 2.3 GHz, up to 3.1 GHz using Turbo Boost Technology
* peak performance:  18.4 GFLOP/s per core
* caches:
  * L2: 256 KB per core
  * L3: 20 MB per processor
* memory bandwidth at the level of the processor: 38.4 GB/s

Nodes equipped with the Intel Xeon E5-2665 CPU have the PBS resource attribute cpu_freq = 24 set, while nodes equipped with the Intel Xeon E5-2470 CPU have cpu_freq = 23 set.

```console
$ qsub -A OPEN-0-0 -q qprod -l select=4:ncpus=16:cpu_freq=24 -I
```

In this example, we allocate 4 nodes with 16 cores at 2.4 GHz per node.

Intel Turbo Boost Technology is used by default. You can disable it for all nodes of a job by using the cpu_turbo_boost resource attribute.

```console
$ qsub -A OPEN-0-0 -q qprod -l select=4:ncpus=16 -l cpu_turbo_boost=0 -I
```

## Memory Architecture

In terms of memory configuration, the cluster contains three types of compute nodes.

### Compute Nodes Without Accelerators

* 2 sockets
* Memory Controllers are integrated into processors.
  * 8 DDR3 DIMMs per node
  * 4 DDR3 DIMMs per CPU
  * 1 DDR3 DIMM per channel
  * Data rate support: up to 1600 MT/s
* Populated memory: 8 x 8 GB DDR3 DIMM 1600 MHz

### Compute Nodes With a GPU or MIC Accelerator

* 2 sockets
* Memory Controllers are integrated into processors.
  * 6 DDR3 DIMMs per node
  * 3 DDR3 DIMMs per CPU
  * 1 DDR3 DIMM per channel
  * Data rate support: up to 1600 MT/s
* Populated memory: 6 x 16 GB DDR3 DIMM 1600 MHz

### Fat Compute Nodes

* 2 sockets
* Memory Controllers are integrated into processors.
  * 16 DDR3 DIMMs per node
  * 8 DDR3 DIMMs per CPU
  * 2 DDR3 DIMMs per channel
  * Data rate support: up to 1600 MT/s
* Populated memory: 16 x 32 GB DDR3 DIMM 1600 MHz
+68 −0
# Hardware Overview

The Anselm cluster consists of 209 computational nodes named cn[1-209] of which 180 are regular compute nodes, 23 are GPU Kepler K20 accelerated nodes, 4 are MIC Xeon Phi 5110P accelerated nodes, and 2 are fat nodes. Each node is a powerful x86-64 computer, equipped with 16 cores (two eight-core Intel Sandy Bridge processors), at least 64 GB of RAM, and a local hard drive. User access to the Anselm cluster is provided by two login nodes login[1,2]. The nodes are interlinked through high speed InfiniBand and Ethernet networks. All nodes share a 320 TB /home disk for storage of user files. The 146 TB shared /scratch storage is available for scratch data.

The Fat nodes are equipped with a large amount (512 GB) of memory. Virtualization infrastructure provides resources to run long-term servers and services in virtual mode. Fat nodes and virtual servers may access 45 TB of dedicated block storage. Accelerated nodes, fat nodes, and virtualization infrastructure are available [upon request][a] from a PI.

Schematic representation of the Anselm cluster. Each box represents a node (computer) or storage capacity:

![](../img/Anselm-Schematic-Representation.png)

The cluster compute nodes cn[1-207] are organized within 13 chassis.

There are four types of compute nodes:

* 180 compute nodes without an accelerator
* 23 compute nodes with a GPU accelerator - an NVIDIA Tesla Kepler K20m
* 4 compute nodes with a MIC accelerator - an Intel Xeon Phi 5110P
* 2 fat nodes - equipped with 512 GB of RAM and two 100 GB SSD drives

[More about Compute nodes][1].

GPU and accelerated nodes are available upon request, see the [Resources Allocation Policy][2].

All of these nodes are interconnected through fast InfiniBand and Ethernet networks.  [More about the Network][3].
Every chassis provides an InfiniBand switch, marked **isw**, connecting all nodes in the chassis, as well as connecting the chassis to the upper level switches.

All of the nodes share a 320 TB /home disk for storage of user files. The 146 TB shared /scratch storage is available for scratch data. These file systems are provided by the Lustre parallel file system. There is also local disk storage available on all compute nodes in /lscratch. [More about Storage][4].

User access to the Anselm cluster is provided by two login nodes, login1 and login2, and the data mover node dm1. [More about accessing the cluster][5].

The parameters are summarized in the following tables:

| **In general**                              |                                              |
| ------------------------------------------- | -------------------------------------------- |
| Primary purpose                             | High Performance Computing                   |
| Architecture of compute nodes               | x86-64                                       |
| Operating system                            | Linux (CentOS)                               |
| [**Compute nodes**][1]                      |                                              |
| Total                                       | 209                                          |
| Processor cores                             | 16 (2 x 8 cores)                             |
| RAM                                         | min. 64 GB, min. 4 GB per core               |
| Local disk drive                            | yes - usually 500 GB                         |
| Compute network                             | InfiniBand QDR, fully non-blocking, fat-tree |
| w/o accelerator                             | 180, cn[1-180]                               |
| GPU accelerated                             | 23, cn[181-203]                              |
| MIC accelerated                             | 4, cn[204-207]                               |
| Fat compute nodes                           | 2, cn[208-209]                               |
| **In total**                                |                                              |
| Total theoretical peak performance  (Rpeak) | 94 TFLOP/s                                   |
| Total max. LINPACK performance  (Rmax)      | 73 TFLOP/s                                   |
| Total amount of RAM                         | 15.136 TB                                    |

| Node             | Processor                               | Memory | Accelerator          |
| ---------------- | --------------------------------------- | ------ | -------------------- |
| w/o accelerator  | 2 x Intel Sandy Bridge E5-2665, 2.4 GHz | 64 GB  | -                    |
| GPU accelerated  | 2 x Intel Sandy Bridge E5-2470, 2.3 GHz | 96 GB  | NVIDIA Kepler K20m   |
| MIC accelerated  | 2 x Intel Sandy Bridge E5-2470, 2.3 GHz | 96 GB  | Intel Xeon Phi 5110P |
| Fat compute node | 2 x Intel Sandy Bridge E5-2665, 2.4 GHz | 512 GB | -                    |

For more details, refer to [Compute nodes][1], [Storage][4], and [Network][3].

[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md

[a]: https://support.it4i.cz/rt
+20 −0
# Introduction

Welcome to the Anselm supercomputer cluster. The Anselm cluster consists of 209 compute nodes, totaling 3344 compute cores with 15 TB RAM, giving over 94 TFLOP/s theoretical peak performance. Each node is a powerful x86-64 computer, equipped with 16 cores, at least 64 GB of RAM, and a 500 GB hard disk drive. Nodes are interconnected through a fully non-blocking fat-tree InfiniBand network and are equipped with Intel Sandy Bridge processors. A few nodes are also equipped with NVIDIA Kepler GPU or Intel Xeon Phi MIC accelerators. Read more in [Hardware Overview][1].

Anselm runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains. These packages are accessible via the [modules environment][2].
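
For illustration, a typical modules session might look like the following (a minimal sketch; the module name is only an example):

```console
$ module avail            # list software modules available on the cluster
$ module load intel       # load a compiler/toolchain module (the name is illustrative)
$ module list             # show the currently loaded modules
```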

The user data shared file-system (HOME, 320 TB) and job data shared file-system (SCRATCH, 146 TB) are available to users.

The PBS Professional workload manager provides [computing resources allocations and job execution][3].

Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].

[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md

[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
+38 −0
# Network

All of the compute and login nodes of Anselm are interconnected through an [InfiniBand][a] QDR network and a Gigabit [Ethernet][b] network. Both networks may be used to transfer user data.

## InfiniBand Network

All of the compute and login nodes of Anselm are interconnected through a high-bandwidth, low-latency [InfiniBand][a] QDR network (IB 4 x QDR, 40 Gbps). The network topology is a fully non-blocking fat-tree.

The compute nodes may be accessed via the InfiniBand network using the ib0 network interface, in the address range 10.2.1.1-209. MPI may be used to establish a native InfiniBand connection among the nodes.

!!! note
    The network provides **2170 MB/s** transfer rates via the TCP connection (single stream) and up to **3600 MB/s** via the native InfiniBand protocol.

The fat-tree topology ensures that peak transfer rates are achieved between any two nodes, independent of network traffic exchanged among other nodes concurrently.

## Ethernet Network

The compute nodes may be accessed via the regular Gigabit Ethernet network interface eth0, in the address range 10.1.1.1-209, or by using aliases cn1-cn209. The network provides **114 MB/s** transfer rates via the TCP connection.

## Example

In this example, we access the node cn110 through the InfiniBand network via the ib0 interface, then from cn110 to cn108 through the Ethernet network.

```console
$ qsub -q qexp -l select=4:ncpus=16 -N Name0 ./myjob
$ qstat -n -u username
                                                            Req'd Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
15209.srv11     username qexp     Name0        5530   4 64    --  01:00 R 00:00
   cn17/0*16+cn108/0*16+cn109/0*16+cn110/0*16

$ ssh 10.2.1.110
$ ssh 10.1.1.108
```

[a]: http://en.wikipedia.org/wiki/InfiniBand
[b]: http://en.wikipedia.org/wiki/Ethernet

docs.it4i/apiv1.md

0 → 100644
+3 −0
# API Placeholder

This page is created automatically from the API source code.

docs.it4i/apiv2.md

0 → 100644
+203 −0
Original line number Diff line number Diff line
# SCS API v2

## Info

- **OpenAPI:** 3.1.0
- **Title:** scs-api-2
- **Version:** 0.1.0
- **Server URL:** `https://scs.it4i.cz/api/v2`

## Paths

### `/dedicated-time`

**GET**

- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries, optionally filtered by cluster name or period preset
- **OperationId:** `dedicated_time_handler`

**Parameters:**

- `cluster` (query): Filter by cluster name; Available values: karolina, barbora, dgx *(optional)*
- `period` (query): Filter by time period preset; Available values: planned, active *(optional)*

**Responses:**

- `200`: List of dedicated time entries
- `400`: Failed to deserialize query, Invalid cluster, Invalid period
  Example:

  ```json
  {
    "message": "Invalid cluster: el_gordo"
  }
  ```
- `500`: Failed to retrieve dedicated time due to a server error
  Example:
  ```json
  {
    "message": "Failed to retreive dedicated time"
  }
  ```
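
For example, a request filtered by cluster and period (a sketch using the server URL and the parameter values listed above) could look like:

```console
$ curl "https://scs.it4i.cz/api/v2/dedicated-time?cluster=karolina&period=active"
```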

### `/dedicated-time-calendar`

**GET**

- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries and generates a VCalendar response.
- **OperationId:** `dedicated_time_calendar`

**Responses:**

- `200`: Dedicated time VCalendar
  Example:

  ```
  BEGIN:VCALENDAR
  VERSION:2.0
  PRODID:-//SUTD Timetable Calendar//randName//EN
  CALSCALE:GREGORIAN
  BEGIN:VEVENT
  UID:1234@example.com
  DTSTAMP:20230101T000000Z
  DTSTART:20230101T000000Z
  DTEND:20230102T000000Z
  SUMMARY:Sample Dedicated Time - Cluster Outage
  DESCRIPTION:Sample Dedicated Time - Cluster Outage
  END:VEVENT
  END:VCALENDAR
  ```

- `500`: Failed to retrieve dedicated time calendar
  Example:

  ```json
  {
    "message": "Failed to retreive dedicated time calendar"
  }
  ```

### `/motd`

**GET**

- **Summary:** Get messages of the day
- **Description:** Retrieves messages of the day, optionally filtered by category
- **OperationId:** `motd`

**Parameters:**

- `category` (query): *(optional)*

**Responses:**

- `200`: List of motd entries
- `400`: Failed to deserialize query, Invalid motd category
- `500`: Failed to retrieve motd entries due to a server error
  Example:

  ```json
  {
    "message": "Failed to retrieve motd"
  }
  ```
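
Similarly, a request filtered by category might look like this (a sketch; the category value is taken from the Motd schema examples below):

```console
$ curl "https://scs.it4i.cz/api/v2/motd?category=public-service-announcement"
```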

## Components

### Schemas

#### DedicatedTime

```yaml
type: object
required:
  - updated_at
properties:
  cluster_type:
    type: [string, 'null']
  date_efficiency:
    type: [string, 'null']
    format: date-time
  date_expiration:
    type: [string, 'null']
    format: date-time
  updated_at:
    type: string
    format: date-time
```

#### Motd

```yaml
type: object
required:
  - id
  - author
  - category
  - created_at
  - updated_at
  - date_modification
  - title
  - message_body
  - systems
properties:
  id:
    type: integer
    format: int32
    examples: [1]
  author:
    type: string
    examples: [Admin]
  category:
    type: string
    examples: [public-service-announcement]
  created_at:
    type: string
    format: date-time
  updated_at:
    type: string
    format: date-time
  date_modification:
    type: string
    format: date-time
  date_efficiency:
    type: [string, 'null']
    format: date-time
  date_expiration:
    type: [string, 'null']
    format: date-time
  date_outage_efficiency:
    type: [string, 'null']
    format: date-time
  date_outage_expiration:
    type: [string, 'null']
    format: date-time
  title:
    type: string
    examples: [Important Update]
  message_body:
    type: string
    examples: [We are experiencing some service disruptions.]
  systems:
    type: array
    items:
      type: string
      examples: [Karolina]
```

#### MsgResponse

```yaml
type: object
description: |
  Common struct for DTO-less responses
  eg. ```200 {"message":"Operation succeeded"}```
required:
  - message
properties:
  message:
    type: string
    examples: [API response]
```
+50 −0
# Introduction

This section contains documentation of decommissioned IT4Innovations' supercomputers and services.

## Salomon

The second supercomputer, built by SGI (now Hewlett Packard Enterprise), was launched in 2015. With a performance of 2 PFlop/s, it was immediately included in the TOP500 list, which ranks the world's most powerful supercomputers. It stayed there until November 2020, falling from 40th place to 460th.

Salomon was decommissioned after six years - at the end of 2021.

### Interesting Facts

| Salomon's facts              |                    |
| ---------------------------- | ------------------ |
| In operation                 | Q2 2015 - Q4 2021  |
| Theoretical peak performance | 2 PFLOP/s          |
| Number of nodes              | 1,008              |
| HOME storage capacity        | 500 TB             |
| SCRATCH storage capacity     | 1,638 TB           |
| Projects computed            | 1,085              |
| Computing jobs run           | ca. 8,700,000      |
| Corehours used               | ca. 1,014,000,000  |

## Anselm

The first supercomputer, built by Atos, was launched in 2013. For the first 3 years, it was placed in makeshift containers on the campus of VSB – Technical University of Ostrava, and was subsequently moved to the data room of the newly constructed IT4Innovations building. Anselm's computational resources were available to Czech and foreign students and scientists in fields such as material sciences, computational chemistry, biosciences, and engineering.

At the end of January 2021, after more than seven years, its operation permanently ceased. In the future, it will be a part of the [World of Civilization exhibition][a] in Lower Vitkovice.

### Interesting Facts

| Anselm's facts               |                    |
| ---------------------------- | ------------------ |
| Cost                         | 90,000,000 CZK     |
| In operation                 | Q2 2013 - Q1 2021  |
| Theoretical peak performance | 94 TFLOP/s         |
| Number of nodes              | 209                |
| HOME storage capacity        | 320 TB             |
| SCRATCH storage capacity     | 146 TB             |
| Projects computed            | 725                |
| Computing jobs run           | 2,630,567          |
| Corehours used               | 134,130,309        |
| Power consumption            | 77 kW              |

## PRACE

The Partnership for Advanced Computing in Europe aims to facilitate access to a research infrastructure that enables high-impact scientific discovery and engineering research and development across all disciplines to enhance European competitiveness for the benefit of society. For more information, see the [official website][b].

[a]: https://www.dolnivitkovice.cz/en/science-and-technology-centre/exhibitions/
[b]: https://prace-ri.eu/
# Hardware Overview

!!!important Work in progress
    Barbora NG documentation is a WIP.
    The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.

    The launch of Barbora NG is planned for October/November.
    In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.

Barbora NG consists of 141 non-accelerated compute nodes named **cn[001-141]**.
Each node is a powerful x86-64 computer equipped with 192 cores
(2x Intel Xeon 6952P with 96 CPU cores) and 768 GB RAM.
User access to the Barbora NG cluster is provided by two login nodes **login[1-2]**.
The nodes are interlinked through high speed InfiniBand NDR and Ethernet networks.

The parameters are summarized in the following tables:

| **In general**                       |                       |
| ------------------------------------ | --------------------- |
| Architecture of compute nodes        | x86-64                |
| Operating system                     | Linux                 |
| [**Compute nodes**][1]               |                       |
| Total                                | 141                   |
| Processor Type                       | [Intel Xeon 6952P][b] |
| Architecture                         | Granite Rapids        |
| Processor cores                      | 96                    |
| Processors per node                  | 2                     |
| RAM                                  | 768 GB                |
| Local disk drive                     | no                    |
| Compute network                      | InfiniBand NDR        |
| non-accelerated                      | 141, cn[001-141]      |
| **In total**                         |                       |
| Theoretical peak performance (Rpeak) | ??? TFLOP/s           |
| Cores                                | 27072                 |
| RAM                                  | 108.288 TB            |

[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md
[6]: visualization.md

[a]: https://support.it4i.cz/rt
[b]: https://www.intel.com/content/www/us/en/products/sku/241643/intel-xeon-6952p-processor-480m-cache-2-10-ghz/specifications.html
 No newline at end of file
+36 −0
# Introduction

!!!important Work in progress
    Barbora NG documentation is a WIP.
    The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.

    The launch of Barbora NG is planned for October/November.
    In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.

Welcome to the Barbora Next Gen (NG) supercomputer cluster.
Barbora NG is our latest supercomputer, which consists of 141 compute nodes,
totaling 27072 compute cores with 108288 GB RAM, giving over ??? TFLOP/s theoretical peak performance.

Nodes are interconnected through a fully non-blocking fat-tree InfiniBand NDR network
and are equipped with Intel Granite Rapids processors.
Read more in [Hardware Overview][1].

The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains.
These packages are accessible via the [modules environment][2].

The user data shared file system and job data shared file system are available to users.

The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].

Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].


[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md

[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://slurm.schedmd.com/
+146 −0
# Compute Nodes

Barbora is a cluster of x86-64 Intel-based nodes built with the BullSequana Computing technology.
The cluster contains three types of compute nodes.

## Compute Nodes Without Accelerators

* 192 nodes
* 6912 cores in total
* 2x Intel Cascade Lake 6240, 18-core, 2.6 GHz processors per node
* 192 GB DDR4 2933 MT/s of physical memory per node (12x16 GB)
* BullSequana X1120 blade servers
* 2995.2 GFLOP/s per compute node
* 1x 1 Gb Ethernet port
* 1x HDR100 IB port
* 3 compute nodes per X1120 blade server
* cn[1-192]

![](img/BullSequanaX1120.png)

## Compute Nodes With a GPU Accelerator

* 8 nodes
* 192 cores in total
* two Intel Skylake Gold 6126, 12-core, 2.6 GHz processors per node
* 192 GB DDR4 2933MT/s with ECC of physical memory per node (12x16 GB)
* 4x GPU accelerator NVIDIA Tesla V100-SXM2 per node
* BullSequana X410-E5 NVLink-V blade servers
* 1996.8 GFLOP/s per compute node
* GPU-to-GPU All-to-All NVLINK 2.0, GPU-Direct
* 1 Gb Ethernet
* 2x HDR100 IB ports
* cn[193-200]

![](img/BullSequanaX410E5GPUNVLink.jpg)

## Fat Compute Node

* 1x BullSequana X808 server
* 128 cores in total
* 8 Intel Skylake 8153, 16-core, 2.0 GHz, 125 W
* 6144 GiB DDR4 2667 MT/s of physical memory per node (96x64 GB)
* 2x HDR100 IB port
* 8192 GFLOP/s
* cn[201]

![](img/BullSequanaX808.jpg)

## Compute Node Summary

| Node type                    | Count | Range       | Memory   | Cores         |
| ---------------------------- | ----- | ----------- | -------- | ------------- |
| Nodes without an accelerator | 192   | cn[1-192]   | 192 GB   | 36 @ 2.6 GHz  |
| Nodes with a GPU accelerator | 8     | cn[193-200] | 192 GB   | 24 @ 2.6 GHz  |
| Fat compute nodes            | 1     | cn[201]     | 6144 GiB | 128 @ 2.0 GHz |

## Processor Architecture

Barbora is equipped with Intel Cascade Lake Xeon Gold 6240 processors (nodes without accelerators),
Intel Skylake Gold 6126 (nodes with accelerators), and Intel Skylake Platinum 8153 (the fat node).

### Intel [Cascade Lake 6240][d]

The Cascade Lake core is largely identical to that of [Skylake][a].
For in-depth detail of the Skylake core/pipeline, see [Skylake (client) § Pipeline][b].

Xeon Gold 6240 is a 64-bit 18-core x86 multi-socket high performance server microprocessor introduced by Intel in 2019. This chip supports up to 4-way multiprocessing. The Gold 6240, which is based on the Cascade Lake microarchitecture and is manufactured on a 14 nm process, sports 2 AVX-512 FMA units as well as three Ultra Path Interconnect links. This microprocessor, which operates at 2.6 GHz with a TDP of 150 W and a turbo boost frequency of up to 3.9 GHz, supports up to 1 TB of hexa-channel DDR4-2933 ECC memory.

* **Family**: Xeon Gold
* **Cores**: 18
* **Threads**: 36
* **L1I Cache**: 576 KiB, 18x32 KiB, 8-way set associative
* **L1D Cache**: 576 KiB, 18x32 KiB, 8-way set associative, write-back
* **L2 Cache**: 18 MiB, 18x1 MiB, 16-way set associative, write-back
* **L3 Cache**: 24.75 MiB, 18x1.375 MiB, 11-way set associative, write-back
* **Instructions**: x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA3, F16C, BMI, BMI2, VT-x, VT-d, TXT, TSX, RDSEED, ADCX, PREFETCHW, CLFLUSHOPT, XSAVE, SGX, MPX, AVX-512 (New instructions for [Vector Neural Network Instructions][c])
* **Frequency**: 2.6 GHz
* **Max turbo**: 3.9 GHz
* **Process**: 14 nm
* **TDP**: 150 W

### Intel [Skylake Gold 6126][e]

Xeon Gold 6126 is a 64-bit dodeca-core x86 multi-socket high performance server microprocessor introduced by Intel in mid-2017. This chip supports up to 4-way multiprocessing. The Gold 6126, which is based on the server configuration of the Skylake microarchitecture and is manufactured on a 14 nm+ process, sports 2 AVX-512 FMA units as well as three Ultra Path Interconnect links. This microprocessor, which operates at 2.6 GHz with a TDP of 125 W and a turbo boost frequency of up to 3.7 GHz, supports up to 768 GiB of hexa-channel DDR4-2666 ECC memory.

* **Family**: Xeon Gold
* **Cores**: 12
* **Threads**: 24
* **L1I Cache**: 384 KiB, 12x32 KiB, 8-way set associative
* **L1D Cache**: 384 KiB, 12x32 KiB, 8-way set associative, write-back
* **L2 Cache**: 12 MiB, 12x1 MiB, 16-way set associative, write-back
* **L3 Cache**: 19.25 MiB, 14x1.375 MiB, 11-way set associative, write-back
* **Instructions**: x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA3, F16C, BMI, BMI2, VT-x, VT-d, TXT, TSX, RDSEED, ADCX, PREFETCHW, CLFLUSHOPT, XSAVE, SGX, MPX, AVX-512
* **Frequency**: 2.6 GHz
* **Max turbo**: 3.7 GHz
* **Process**: 14 nm
* **TDP**: 125 W

### Intel [Skylake Platinum 8153][f]

Xeon Platinum 8153 is a 64-bit 16-core x86 multi-socket highest performance server microprocessor introduced by Intel in mid-2017. This chip supports up to 8-way multiprocessing. The Platinum 8153, which is based on the server configuration of the Skylake microarchitecture and is manufactured on a 14 nm+ process, sports 2 AVX-512 FMA units as well as three Ultra Path Interconnect links. This microprocessor, which operates at 2 GHz with a TDP of 125 W and a turbo boost frequency of up to 2.8 GHz, supports up to 768 GiB of hexa-channel DDR4-2666 ECC memory.

* **Family**: Xeon Platinum
* **Cores**: 16
* **Threads**: 32
* **L1I Cache**: 512 KiB, 16x32 KiB, 8-way set associative
* **L1D Cache**: 512 KiB, 16x32 KiB, 8-way set associative, write-back
* **L2 Cache**: 16 MiB, 16x1 MiB, 16-way set associative, write-back
* **L3 Cache**: 22 MiB, 16x1.375 MiB, 11-way set associative, write-back
* **Instructions**: x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA3, F16C, BMI, BMI2, VT-x, VT-d, TXT, TSX, RDSEED, ADCX, PREFETCHW, CLFLUSHOPT, XSAVE, SGX, MPX, AVX-512
* **Frequency**: 2.0 GHz
* **Max turbo**: 2.8 GHz
* **Process**: 14 nm
* **TDP**: 125 W

## GPU Accelerator

Barbora is equipped with [NVIDIA Tesla V100-SXM2][g] accelerators.

![](img/gpu-v100.png)

| NVIDIA Tesla V100-SXM2       |                                        |
| ---------------------------- | -------------------------------------- |
| GPU Architecture             | NVIDIA Volta                           |
| NVIDIA Tensor Cores          | 640                                    |
| NVIDIA CUDA® Cores           | 5120                                   |
| Double-Precision Performance | 7.8 TFLOP/s                             |
| Single-Precision Performance | 15.7 TFLOP/s                            |
| Tensor Performance           | 125 TFLOP/s                             |
| GPU Memory                   | 16 GB HBM2                              |
| Memory Bandwidth             | 900 GB/sec                              |
| ECC                          | Yes                                    |
| Interconnect Bandwidth       | 300 GB/sec                              |
| System Interface             | NVIDIA NVLink                          |
| Form Factor                  | SXM2                                   |
| Max Power Consumption        | 300 W                                   |
| Thermal Solution             | Passive                                |
| Compute APIs                 | CUDA, DirectCompute, OpenCLTM, OpenACC |

[a]: https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)#Core
[b]: https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Pipeline
[c]: https://en.wikichip.org/wiki/x86/avx512vnni
[d]: https://en.wikichip.org/wiki/intel/xeon_gold/6240
[e]: https://en.wikichip.org/wiki/intel/xeon_gold/6126
[f]: https://en.wikichip.org/wiki/intel/xeon_platinum/8153
[g]: https://images.nvidia.com/content/technologies/volta/pdf/tesla-volta-v100-datasheet-letter-fnl-web.pdf
+67 −0
# Hardware Overview

The Barbora cluster consists of 201 computational nodes named **cn[001-201]**
of which 192 are regular compute nodes, 8 are GPU Tesla V100 accelerated nodes and 1 is a fat node.
Each node is a powerful x86-64 computer, equipped with 36/24/128 cores
(18-core Intel Cascade Lake 6240 / 12-core Intel Skylake Gold 6126 / 16-core Intel Skylake 8153) and at least 192 GB of RAM.
User access to the Barbora cluster is provided by two login nodes **login[1,2]**.
The nodes are interlinked through high speed InfiniBand and Ethernet networks.

The fat node is equipped with 6144 GB of memory.
Virtualization infrastructure provides resources for running long-term servers and services in virtual mode.
The Accelerated nodes, fat node, and virtualization infrastructure are available [upon request][a] from a PI.

**There are three types of compute nodes:**

* 192 compute nodes without an accelerator
* 8 compute nodes with a GPU accelerator - 4x NVIDIA Tesla V100-SXM2
* 1 fat node - equipped with 6144 GB of RAM

[More about compute nodes][1].

GPU and accelerated nodes are available upon request, see the [Resources Allocation Policy][2].

All of these nodes are interconnected through fast InfiniBand and Ethernet networks.
[More about the computing network][3].
Every chassis provides an InfiniBand switch, marked **isw**, connecting all nodes in the chassis,
as well as connecting the chassis to the upper level switches.

User access to Barbora is provided by two login nodes: login1 and login2.
[More about accessing the cluster][5].

The parameters are summarized in the following tables:

| **In general**                              |                                              |
| ------------------------------------------- | -------------------------------------------- |
| Primary purpose                             | High Performance Computing                   |
| Architecture of compute nodes               | x86-64                                       |
| Operating system                            | Linux                                        |
| [**Compute nodes**][1]                      |                                              |
| Total                                       | 201                                          |
| Processor cores                             | 36/24/128 (2x18 cores/2x12 cores/8x16 cores) |
| RAM                                         | min. 192 GB                                  |
| Local disk drive                            | no                                           |
| Compute network                             | InfiniBand HDR                               |
| w/o accelerator                             | 192, cn[001-192]                             |
| GPU accelerated                             | 8, cn[193-200]                               |
| Fat compute nodes                           | 1, cn[201]                                   |
| **In total**                               |                                             |
| Total theoretical peak performance  (Rpeak) | 848.8448 TFLOP/s                             |
| Total amount of RAM                         | 44.544 TB                                    |

| Node             | Processor                               | Memory | Accelerator            |
| ---------------- | --------------------------------------- | ------ | ---------------------- |
| Regular node     | 2x Intel Cascade Lake 6240, 2.6 GHz     | 192GB  | -                      |
| GPU accelerated  | 2x Intel Skylake Gold 6126, 2.6 GHz     | 192GB  | NVIDIA Tesla V100-SXM2 |
| Fat compute node | 2x Intel Skylake Platinum 8153, 2.0 GHz | 6144GB | -                      |

For more details refer to the sections [Compute Nodes][1], [Storage][4], [Visualization Servers][6], and [Network][3].

[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md
[6]: visualization.md

[a]: https://support.it4i.cz/rt
+25 −0
# Introduction

Welcome to the Barbora supercomputer cluster. The Barbora cluster consists of 201 compute nodes, totaling 7232 compute cores with 44544 GB RAM, giving over 848 TFLOP/s theoretical peak performance.

Nodes are interconnected through a fully non-blocking fat-tree InfiniBand network, and are equipped with Intel Cascade Lake processors. A few nodes are also equipped with NVIDIA Tesla V100-SXM2. Read more in [Hardware Overview][1].

The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains. These packages are accessible via the [modules environment][2].

The user data shared file system and job data shared file system are available to users.

The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].
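
As a minimal illustration of requesting resources through Slurm (a sketch only; the project ID, partition, and script name are placeholders):

```console
$ salloc -A PROJECT-ID -p qcpu --nodes=1 --time=01:00:00   # interactive allocation
$ sbatch -A PROJECT-ID -p qcpu --nodes=2 ./myjob.sh        # batch job submission
```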

Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].

![](img/BullSequanaX.png)

[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md

[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://slurm.schedmd.com/
+52 −0
# Network

All of the compute and login nodes of Barbora are interconnected through an [InfiniBand][a] HDR 200 Gbps network and a Gigabit Ethernet network.

Compute nodes and the service infrastructure are connected by the HDR100 technology,
which allows one 200 Gbps HDR port (an aggregation of 4x 50 Gbps) to be divided into two HDR100 ports, each with 100 Gbps (2x 50 Gbps) bandwidth.

The cabling between the L1 and L2 layers is realized by HDR cabling,
while the end devices are connected by a so-called Y (splitter) cable (1x HDR200 - 2x HDR100).

![](img/hdr.jpg)

**The computing network thus implemented fulfills the following parameters**

* 100 Gbps
* Latencies less than 10 microseconds (0.6 μs end-to-end, <90ns switch hop)
* Adaptive routing support
* MPI communication support
* IP protocol support (IPoIB)
* Support for SCRATCH Data Storage and NVMe over Fabric Data Storage.

## Mellanox QM8700 40-Ports Switch

**Performance**

* 40x HDR 200 Gb/s ports in a 1U switch
* 80x HDR100 100 Gb/s ports in a 1U switch
* 16 Tb/s aggregate switch throughput
* Up to 15.8 billion messages-per-second
* 90ns switch latency

**Optimized Design**

* 1+1 redundant & hot-swappable power
* 80 PLUS Gold and Energy Star certified power supplies
* Dual-core x86 CPU

**Advanced Design**

* Adaptive routing
* Collective offloads (Mellanox SHARP technology)
* VL mapping (VL2VL)

![](img/QM8700.jpg)

## BullSequana XH2000 HDRx WH40 MODULE

* Mellanox QM8700 switch modified for direct liquid cooling (Atos Cold Plate), with form factor for installing the Bull Sequana XH2000 rack

![](img/XH2000.png)

[a]: http://en.wikipedia.org/wiki/InfiniBand
# Storage

There are three main shared file systems on the Barbora cluster: [HOME][1], [SCRATCH][2], and [PROJECT][5]. All login and compute nodes may access the same data on the shared file systems. Compute nodes are also equipped with local (non-shared) scratch, RAM disk, and tmp file systems.

## Archiving

Do not use shared filesystems as a backup for large amounts of data or as a means of long-term archiving. The academic staff and students of research institutions in the Czech Republic can use the [CESNET storage service][3], which is available via SSHFS.

## Shared Filesystems

The Barbora cluster provides three main shared filesystems: the [HOME filesystem][1], the [SCRATCH filesystem][2], and the [PROJECT filesystem][5].

All filesystems are accessible via the InfiniBand network.

The HOME and PROJECT filesystems are realized as NFS filesystems.

The SCRATCH filesystem is realized as a parallel Lustre filesystem.

Extended ACLs are provided on the Lustre filesystem for sharing data with other users using fine-grained control.

### Understanding the Lustre Filesystems

A user file on the [Lustre filesystem][a] can be divided into multiple chunks (stripes) and stored across a subset of the object storage targets (OSTs) (disks). The stripes are distributed among the OSTs in a round-robin fashion to ensure load balancing.

When a client (a compute node from your job) needs to create or access a file, the client queries the metadata server (MDS) and the metadata target (MDT) for the layout and location of the [file's stripes][b]. Once the file is opened and the client obtains the striping information, the MDS is no longer involved in the file I/O process. The client interacts directly with the object storage servers (OSSes) and OSTs to perform I/O operations such as locking, disk allocation, storage, and retrieval.

If multiple clients try to read and write the same part of a file at the same time, the Lustre distributed lock manager enforces coherency, so that all clients see consistent results.

There is default stripe configuration for Barbora Lustre filesystems. However, users can set the following stripe parameters for their own directories or files to get optimum I/O performance:

1. `stripe_size` the size of the chunk in bytes; specify with k, m, or g to use units of KB, MB, or GB, respectively; the size must be an even multiple of 65,536 bytes; default is 1MB for all Barbora Lustre filesystems
1. `stripe_count` the number of OSTs to stripe across; default is 1 for Barbora Lustre filesystems; one can specify -1 to use all OSTs in the filesystem.
1. `stripe_offset` the index of the OST where the first stripe is to be placed; default is -1 which results in random selection; using a non-default value is NOT recommended.

!!! note
    Setting stripe size and stripe count correctly for your needs may significantly affect the I/O performance.

Use the `lfs getstripe` command for getting the stripe parameters. Use `lfs setstripe` for setting the stripe parameters to get optimal I/O performance. The correct stripe setting depends on your needs and file access patterns.

```console
$ lfs getstripe dir|filename
$ lfs setstripe -s stripe_size -c stripe_count -o stripe_offset dir|filename
```

Example:

```console
$ lfs getstripe /scratch/projname
$ lfs setstripe -c -1 /scratch/projname
$ lfs getstripe /scratch/projname
```

In this example, we view the current stripe setting of the /scratch/projname/ directory. The stripe count is changed to all OSTs and verified. All files written to this directory will be striped over all 5 OSTs.

Use `lfs check osts` to see the number and status of active OSTs for each filesystem on Barbora. Learn more by reading the man page:

```console
$ lfs check osts
$ man lfs
```

### Hints on Lustre Stripping

!!! note
    Increase the `stripe_count` for parallel I/O to the same file.

When multiple processes are writing blocks of data to the same file in parallel, the I/O performance for large files will improve when the `stripe_count` is set to a larger value. The stripe count sets the number of OSTs to which the file will be written. By default, the stripe count is set to 1. While this default setting provides for efficient access of metadata (for example to support the `ls -l` command), large files should use stripe counts of greater than 1. This will increase the aggregate I/O bandwidth by using multiple OSTs in parallel instead of just one. A rule of thumb is to use a stripe count approximately equal to the number of gigabytes in the file.

Another good practice is to make the stripe count be an integral factor of the number of processes performing the write in parallel, so that you achieve load balance among the OSTs. For example, set the stripe count to 16 instead of 15 when you have 64 processes performing the writes.
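
For example, for a single shared output file written by 64 processes, you might set a stripe count of 16 on the target directory before the job runs. The path below is only illustrative; use your own project directory:

```console
$ lfs setstripe -c 16 /scratch/project/PROJECT_ID/output
$ lfs getstripe /scratch/project/PROJECT_ID/output
```

Files created in the directory afterwards inherit the new striping.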

!!! note
    Using a large stripe size can improve performance when accessing very large files.

Large stripe size allows each client to have exclusive access to its own part of a file. However, it can be counterproductive in some cases if it does not match your I/O pattern. The choice of stripe size has no effect on a single-stripe file.

Read more [here][c].

### Lustre on Barbora

The architecture of Lustre on Barbora is composed of two metadata servers (MDS) and two data/object storage servers (OSS).

**Configuration of the SCRATCH storage**

* 2x Metadata server
* 2x Object storage server
* Lustre object storage
  * One disk array NetApp E2800
  * 54x 8TB 10kRPM 2.5” SAS HDDs
  * 5x RAID6 (8+2) OST (Object storage target)
  * 4 hot-spare disks
* Lustre metadata storage
  * One disk array NetApp E2600
  * 12x 300GB 15kRPM SAS disks
  * 2 groups of 5 disks in RAID5 (Metadata target)
  * 2 hot-spare disks

### HOME File System

The HOME filesystem is mounted in the /home directory. Users' home directories /home/username reside on this filesystem. The accessible capacity is 28TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 25GB per user. Should 25GB prove insufficient, contact [support][d]; the quota may be lifted upon request.

!!! note
    The HOME filesystem is intended for preparation, evaluation, processing and storage of data generated by active Projects.

The HOME filesystem should not be used to archive data of past Projects or other unrelated data.

The files on HOME filesystem will not be deleted until the end of the [user's lifecycle][4].

The filesystem is backed up, so that it can be restored in case of a catastrophic failure resulting in significant data loss. However, this backup is not intended to restore old versions of user data or to restore (accidentally) deleted files.

| HOME filesystem      |                 |
| -------------------- | --------------- |
| Accesspoint          | /home/username  |
| Capacity             | 28TB           |
| Throughput           | 1GB/s          |
| User space quota     | 25GB           |
| User inodes quota    | 500K           |
| Protocol             | NFS             |

### SCRATCH File System

The SCRATCH filesystem is realized as a Lustre parallel filesystem and is available from all login and compute nodes. There are 5 OSTs dedicated to the SCRATCH filesystem.

The SCRATCH filesystem is mounted in the `/scratch/project/PROJECT_ID` directory created automatically with the `PROJECT_ID` project. The accessible capacity is 310TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and denying service to other users. Should 10TB prove insufficient, contact [support][d]; the quota may be lifted upon request.

!!! note
    The Scratch filesystem is intended for temporary scratch data generated during the calculation as well as for high-performance access to input and output files. All I/O intensive jobs must use the SCRATCH filesystem as their working directory.

    Users are advised to save the necessary data from the SCRATCH filesystem to HOME filesystem after the calculations and clean up the scratch files.

!!! warning
    Files on the SCRATCH filesystem that are **not accessed for more than 90 days** will be automatically **deleted**.
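
To review which of your data is approaching this limit, you can, for example, list files that have not been accessed for more than 60 days. This is only a sketch; adjust the path and threshold to your project:

```console
$ find /scratch/project/PROJECT_ID -type f -atime +60
```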

The default stripe size is 1MB and the default stripe count is 1.

!!! note
    Setting stripe size and stripe count correctly for your needs may significantly affect the I/O performance.

| SCRATCH filesystem   |           |
| -------------------- | --------- |
| Mountpoint           | /scratch  |
| Capacity             | 310TB     |
| Throughput           | 5GB/s     |
| Throughput [Burst]   | 38GB/s    |
| User space quota     | 10TB      |
| User inodes quota    | 10M       |
| Default stripe size  | 1MB       |
| Default stripe count | 1         |
| Number of OSTs       | 5         |

### PROJECT File System

The PROJECT data storage is a central storage for projects'/users' data on IT4Innovations that is accessible from all clusters.
For more information, see the [PROJECT storage][6] section.

### Disk Usage and Quota Commands

Disk usage and user quotas can be checked and reviewed using the `it4ifsusage` command. You can see an example output [here][9].

To have a better understanding of where the space is exactly used, you can use following command:

```console
$ du -hs dir
```

Example for your HOME directory:

```console
$ cd /home
$ du -hs * .[a-zA-Z0-9]* | grep -E "[0-9]*G|[0-9]*M" | sort -hr
258M     cuda-samples
15M      .cache
13M      .mozilla
5,5M     .eclipse
2,7M     .idb_13.0_linux_intel64_app
```

This will list all files and directories with megabytes or gigabytes of consumed space in your current directory (HOME in this example). The list is sorted in descending order from the largest to the smallest files/directories.

### Extended ACLs

Extended ACLs provide another security mechanism beside the standard POSIX ACLs, which are defined by three entries (for owner/group/others). Extended ACLs have more than the three basic entries. In addition, they also contain a mask entry and may contain any number of named user and named group entries.

ACLs on a Lustre file system work exactly like ACLs on any Linux file system. They are manipulated with the standard tools in the standard manner.

For more information, see the [Access Control List][7] section of the documentation.
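
As a quick illustration, granting another user read access to a shared directory with the standard tools might look like this; the username and path below are only placeholders:

```console
$ setfacl -m u:otheruser:rX /scratch/project/PROJECT_ID/shared
$ getfacl /scratch/project/PROJECT_ID/shared
```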

## Local Filesystems

### TMP

Each node is equipped with a local /tmp RAMDISK directory. The /tmp directory should be used to work with temporary files. Old files in the /tmp directory are automatically purged.

### SCRATCH and RAMDISK

Each node is equipped with RAMDISK storage accessible at /tmp, /lscratch, and /ramdisk. The RAMDISK capacity is 180GB; data placed on the RAMDISK occupies the node's RAM memory (192GB in total). The RAMDISK directory should only be used to work with temporary files where very high throughput or I/O performance is required. Old files in the RAMDISK directory are automatically purged at the job's end.

#### Global RAM Disk

The Global RAM disk spans the local RAM disks of all the allocated nodes within a single job.
For more information, see the [Job Features][8] section.

## Summary

| Mountpoint | Usage                 | Protocol | Net Capacity | Throughput                 | Limitations | Access                  | Services                              |
| ---------- | --------------------- | -------- | ------------ | -------------------------- | ----------- | ----------------------- | ------------------------------------- |
| /home      | home directory        | NFS      | 28TB         | 1GB/s                      | Quota 25GB  | Compute and login nodes | backed up                             |
| /scratch   | scratch temporary     | Lustre   | 310TB        | 5GB/s, 30GB/s burst buffer | Quota 10TB  | Compute and login nodes | files older than 90 days auto-removed |
| /lscratch  | local scratch ramdisk | tmpfs    | 180GB        | 130GB/s                    | none        | Node local              | auto purged after job end             |

[1]: #home-file-system
[2]: #scratch-file-system
[3]: ../storage/cesnet-storage.md
[4]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[5]: #project-file-system
[6]: ../storage/project-storage.md
[7]: ../storage/standard-file-acl.md
[8]: ../job-features.md#global-ram-disk
[9]: ../storage/project-storage.md#project-quotas

[a]: http://www.nas.nasa.gov
[b]: http://www.nas.nasa.gov/hecc/support/kb/Lustre_Basics_224.html#striping
[c]: http://doc.lustre.org/lustre_manual.xhtml#managingstripingfreespace
[d]: https://support.it4i.cz/rt
[e]: http://man7.org/linux/man-pages/man1/nfs4_setfacl.1.html
# Visualization Servers

Remote visualization with [VirtualGL][3] is available on two nodes.

* 2 nodes
* 32 cores in total
* 2x Intel Skylake Gold 6130 – 16-core@2.1 GHz processors per node
* 192 GB DDR4 2667 MT/s of physical memory per node (12x 16 GB)
* BullSequana X450-E5 blade servers
* 2150.4 GFLOP/s per compute node
* 1x 1 GB Ethernet and 2x 10 GB Ethernet
* 1x HDR100 IB port
* 2x SSD 240 GB

![](img/bullsequanaX450-E5.png)

## NVIDIA Quadro P6000

* GPU Memory: 24 GB GDDR5X
* Memory Interface: 384-bit
* Memory Bandwidth: Up to 432 GB/s
* NVIDIA CUDA® Cores: 3840
* System Interface: PCI Express 3.0 x16
* Max Power Consumption: 250 W
* Thermal Solution: Active
* Form Factor: 4.4”H x 10.5” L, Dual Slot, Full Height
* Display Connectors: 4x DP 1.4 + DVI-D DL
* Max Simultaneous Displays: 4 direct, 4 DP1.4 Multi-Stream
* Max DP 1.4 Resolution: 7680 x 4320 @ 30 Hz
* Max DVI-D DL Resolution: 2560 x 1600 @ 60 Hz
* Graphics APIs: Shader Model 5.1, OpenGL 4.5, DirectX 12.0, Vulkan 1.0,
* Compute APIs: CUDA, DirectCompute, OpenCL™
* Floating-Point Performance-Single Precision: 12.6 TFLOP/s, Peak

![](img/quadrop6000.jpg)

## Resource Allocation Policy

| queue | active project | project resources | nodes | min ncpus | priority | authorization | walltime |
|-------|----------------|-------------------|-------|-----------|----------|---------------|----------|
| qviz Visualization queue | yes | none required | 2 | 4 | 150 | no | 1h/8h |
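
An interactive session on a visualization node can then be requested through the scheduler. The following is only a sketch, assuming the `qviz` queue is addressed by the same name; adjust the project ID and walltime as needed:

```console
$ salloc -A PROJECT-ID -p qviz --time=01:00:00
```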

## References

* [Graphical User Interface][1]
* [VPN Access][2]

[1]: ../general/shell-and-data-access.md#graphical-user-interface
[2]: ../general/shell-and-data-access.md#vpn-access
[3]: ../software/viz/vgl.md
# e-INFRA CZ Cloud Ostrava

Ostrava cloud consists of 22 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.

## Access

To access the cloud, you must:

* have an [e-Infra CZ account][3],
* be a member of an [active project][b].

The dashboard is available at [https://ostrava.openstack.cloud.e-infra.cz/][6].

You can specify resources/quotas for your project.
For more information, see the [Quota Limits][5] section.

## Creating First Instance

To create your first VM instance, follow the [e-INFRA CZ guide][4].
Note that the guide is similar for clouds in Brno and Ostrava,
so make sure that you follow steps for Ostrava cloud where applicable.

### Process Automatization

You can automate the process using Terraform or OpenStack.

#### Terraform

Prerequisites:

* Linux/Mac/WSL terminal BASH shell
* installed Terraform and sshuttle
* downloaded [application credentials][9] from OpenStack Horizon dashboard and saved as a `project_openrc.sh.inc` text file

Follow the guide: [https://code.it4i.cz/terraform][8]
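
The linked guide covers the full setup; in general, once the application credentials are sourced, the standard Terraform workflow applies. A minimal sketch, assuming the configuration files from the guide are in the current directory:

```console
$ source project_openrc.sh.inc
$ terraform init
$ terraform plan
$ terraform apply
```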

#### OpenStack

Prerequisites:

* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]

Follow the guide: [https://code.it4i.cz/commandline][10]

Run commands:

```console
source project_openrc.sh.inc
```

```console
./cmdline-demo.sh basic-infrastructure-1
```
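
After the script finishes, you can verify the created resources with the OpenStack client, for example:

```console
$ openstack server list
$ openstack network list
```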

## Technical Reference

For the list of deployed OpenStack services, see the [list of components][1].

More information can be found on the [e-INFRA CZ website][2].

[1]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/openstack-components
[2]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site
[3]: https://docs.account.e-infra.cz/en/docs/access/account#how-to-apply-for-the-first-time
[4]: https://docs.platforms.cloud.e-infra.cz/en/docs/getting-started/creating-first-infrastructure
[5]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/quota-limits
[6]: https://ostrava.openstack.cloud.e-infra.cz/
[7]: https://cyso.cloud/docs/cloud/extra/how-to-use-the-openstack-cli-tools-on-linux/
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.platforms.cloud.e-infra.cz/en/docs/how-to-guides/obtaining-api-key
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline

[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
# IT4I Cloud

IT4I cloud consists of 14 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.

!!! Note
    The guide describes steps for personal projects.<br>
    Some steps may differ for large projects.<br>
    For large projects, apply for resources to the [Allocation Committee][11].

## Access

To access the cloud you must be a member of an active EUROHPC project,
or fall into the **Access Category B**, i.e. [Access For Thematic HPC Resource Utilisation][11].

A personal OpenStack project is required. Request one by contacting [IT4I Support][12].

The dashboard is available at [https://cloud.it4i.cz][6].

You can see quotas set for the IT4I Cloud in the [Quota Limits][f] section.

## Creating First Instance

To create your first VM instance, follow the steps below:

### Log In

Go to [https://cloud.it4i.cz][6], enter your LDAP username and password and choose the `IT4I_LDAP` domain. After you sign in, you will be redirected to the dashboard.

![](../img/login.png)

### Create Key Pair

SSH key is required for remote access to your instance.

1. Go to **Project > Compute > Key Pairs** and click the **Create Key Pair** button.

    ![](../img/keypairs.png)

1. In the Create Key Pair window, name your key pair, select `SSH Key` for key type and confirm by clicking Create Key Pair.

    ![](../img/keypairs1.png)

1. Download and manage the private key according to your operating system.

### Update Security Group

To be able to remotely access your VM instance, you have to allow access in the security group.

1. Go to **Project > Network > Security Groups** and click on **Manage Rules** for the default security group.

    ![](../img/securityg.png)

1. Click on **Add Rule**, choose **SSH**, and leave the remaining fields unchanged.

    ![](../img/securityg1.png)

### Create VM Instance

1. In **Compute > Instances**, click **Launch Instance**.

    ![](../img/instance.png)

1. Choose Instance Name, Description, and number of instances. Click **Next**.

    ![](../img/instance1.png)

1. Choose an image from which to boot the instance. Choose to delete the volume after instance delete. Click **Next**.

    ![](../img/instance2.png)

1. Choose the hardware resources of the instance by selecting a flavor. Additional volumes for data can be attached later on. Click **Next**.

    ![](../img/instance3.png)

1. Select the network and continue to **Security Groups**.

    ![](../img/instance4.png)

1. Allocate the security group with SSH rule that you added in the [Update Security Group](it4i-cloud.md#update-security-group) step. Then click **Next** to go to the **Key Pair**.

    ![](../img/securityg2.png)

1. Select the key that you created in the [Create Key Pair][g] section and launch the instance.

    ![](../img/instance5.png)

### Associate Floating IP

1. Click on the **Associate** button next to the floating IP.

    ![](../img/floatingip.png)

1. Select Port to be associated with the instance, then click the **Associate** button.

Now you can join the VM using your preferred SSH client.
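
For example, from a Linux terminal (the key file name, login user, and floating IP below are placeholders; the default user depends on the image you chose):

```console
$ ssh -i ~/.ssh/my-keypair.pem ubuntu@FLOATING_IP
```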

## Process Automatization

You can automate the process using Openstack.

### OpenStack

Prerequisites:

* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]

Follow the guide: [https://code.it4i.cz/commandline][10]

Run commands:

```console
source project_openrc.sh.inc
```

```console
./cmdline-demo.sh basic-infrastructure-1
```

[1]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/openstack-components/
[2]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/
[3]: https://docs.e-infra.cz/account/
[4]: https://docs.e-infra.cz/compute/openstack/getting-started/creating-first-infrastructure/
[5]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-g2-site/quota-limits/
[6]: https://cloud.it4i.cz
[7]: https://docs.fuga.cloud/how-to-use-the-openstack-cli-tools-on-linux
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.e-infra.cz/compute/openstack/how-to-guides/obtaining-api-key/
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline
[11]: https://www.it4i.cz/en/for-users/computing-resources-allocation
[12]: mailto:support@it4i.cz

[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
[c]: einfracz-cloud.md
[d]: ../general/accessing-the-clusters/vpn-access.md
[e]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[f]: it4i-quotas.md
[g]: it4i-cloud.md#create-key-pair

# IT4I Cloud Quotas

| Resource                              | Quota |
|---------------------------------------|-------|
| Instances                             |    10 |
| VCPUs                                 |    20 |
| RAM                                   |  32GB |
| Volumes                               |    20 |
| Volume Snapshots                      |    12 |
| Volume Storage                        |   500 |
| Floating-IPs                          |     1 |
| Security Groups                       |    10 |
| Security Group Rules                  |   100 |
| Networks                              |     1 |
| Ports                                 |    10 |
| Routers                               |     1 |
| Backups                               |    12 |
| Groups                                |    10 |
| rbac_policies                         |    10 |
| Subnets                               |     1 |
| Subnet_pools                          |    -1 |
| Fixed-ips                             |    -1 |
| Injected-file-size                    | 10240 |
| Injected-path-size                    |   255 |
| Injected-files                        |     5 |
| Key-pairs                             |   100 |
| Properties                            |   128 |
| Server-groups                         |    10 |
| Server-group-members                  |    10 |
| Backup-gigabytes                      |  1002 |
| Per-volume-gigabytes                  |    -1 |

docs.it4i/config.yml

host: irods.it4i.cz
port: 1247
proxy_user: some_user
client_user: some_user
zone: IT4I

authscheme: "pam"
ssl_ca_cert_file: "~/.irods/chain_geant_ov_rsa_ca_4_full.pem"
ssl_encryption_key_size: 32
ssl_encryption_algorithm: "AES-256-CBC"
ssl_encryption_salt_size: 8
ssl_encryption_hash_rounds: 16

path_mappings:
  - irods_path: /IT4I/home/some_user
    mapping_path: /
    resource_type: dir
# Accessing Complementary Systems

Complementary systems can be accessed at `login.cs.it4i.cz`
by any user with an active account assigned to an active project.

**SSH is required** to access Complementary systems.
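
For example (replace `username` with your IT4I login name):

```console
$ ssh username@login.cs.it4i.cz
```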

## Data Storage

### Home

The `/home` file system is shared across all Complementary systems. Note that this file system is **not** shared with the file system on IT4I clusters.

### Scratch

There are local `/lscratch` storages on individual nodes.

### PROJECT

Complementary systems are connected to the [PROJECT storage][1].

[1]: ../storage/project-storage.md
# Using AMD Partition

For testing your application on the AMD partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 64 -A PROJECT-ID -p p03-amd --gres=gpu:4 --time=08:00:00
```

where:

- `-N 1` means allocating one server,
- `-c 64` means allocating 64 cores,
- `-A` is your project,
- `-p p03-amd` is AMD partition,
- `--gres=gpu:4` means allocating all 4 GPUs of the node,
- `--time=08:00:00` means allocation for 8 hours.

You also have the option to allocate only a subset of the resources
by reducing the `-c` and `--gres=gpu` values.

```console
salloc -N 1 -c 48 -A PROJECT-ID -p p03-amd --gres=gpu:3 --time=08:00:00
salloc -N 1 -c 32 -A PROJECT-ID -p p03-amd --gres=gpu:2 --time=08:00:00
salloc -N 1 -c 16 -A PROJECT-ID -p p03-amd --gres=gpu:1 --time=08:00:00
```

!!! Note
    p03-amd01 server has hyperthreading **enabled** therefore htop shows 128 cores.<br>
    p03-amd02 server has hyperthreading **disabled** therefore htop shows 64 cores.

## Using AMD MI100 GPUs

The AMD GPUs can be programmed using the [ROCm open-source platform](https://docs.amd.com/).

ROCm and related libraries are installed directly in the system.
You can find it here:

```console
/opt/rocm/
```

The actual version can be found here:

```console
[user@p03-amd02.cs]$ cat /opt/rocm/.info/version

5.5.1-74
```
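
To check that the MI100 accelerators allocated to your job are visible, you can, for example, use the `rocm-smi` utility shipped with ROCm:

```console
[user@p03-amd02.cs ~]$ rocm-smi
```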

## Basic HIP Code

The first way how to program AMD GPUs is to use HIP.

The basic vector addition code in HIP looks like this.
This is a full code example; you can copy and paste it into a file.
For this example, we use `vector_add.hip.cpp`.

```cpp
#include <cstdio>
#include <hip/hip_runtime.h>



__global__ void add_vectors(float * x, float * y, float alpha, int count)
{
    long long idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < count)
        y[idx] += alpha * x[idx];
}

int main()
{
    // number of elements in the vectors
    long long count = 10;

    // allocation and initialization of data on the host (CPU memory)
    float * h_x = new float[count];
    float * h_y = new float[count];
    for(long long i = 0; i < count; i++)
    {
        h_x[i] = i;
        h_y[i] = 10 * i;
    }

    // print the input data
    printf("X:");
    for(long long i = 0; i < count; i++)
        printf(" %7.2f", h_x[i]);
    printf("\n");
    printf("Y:");
    for(long long i = 0; i < count; i++)
        printf(" %7.2f", h_y[i]);
    printf("\n");

    // allocation of memory on the GPU device
    float * d_x;
    float * d_y;
    hipMalloc(&d_x, count * sizeof(float));
    hipMalloc(&d_y, count * sizeof(float));

    // copy the data from host memory to the device
    hipMemcpy(d_x, h_x, count * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(d_y, h_y, count * sizeof(float), hipMemcpyHostToDevice);

    int tpb = 256;
    int bpg = (count - 1) / tpb + 1;
    // launch the kernel on the GPU
    add_vectors<<< bpg, tpb >>>(d_x, d_y, 100, count);
    // hipLaunchKernelGGL(add_vectors, bpg, tpb, 0, 0, d_x, d_y, 100, count);

    // copy the result back to CPU memory
    hipMemcpy(h_y, d_y, count * sizeof(float), hipMemcpyDeviceToHost);

    // print the results
    printf("Y:");
    for(long long i = 0; i < count; i++)
        printf(" %7.2f", h_y[i]);
    printf("\n");

    // free the allocated memory
    hipFree(d_x);
    hipFree(d_y);
    delete[] h_x;
    delete[] h_y;

    return 0;
}
```

To compile the code, we use the `hipcc` compiler.
For compiler information, use `hipcc --version`:

```console
[user@p03-amd02.cs ~]$ hipcc --version

HIP version: 5.5.30202-eaf00c0b
AMD clang version 16.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.5.1 23194 69ef12a7c3cc5b0ccf820bc007bd87e8b3ac3037)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/rocm-5.5.1/llvm/bin
```

The code is compiled as follows:

```console
hipcc vector_add.hip.cpp -o vector_add.x
```

The correct output of the code is:

```console
[user@p03-amd02.cs ~]$ ./vector_add.x
X:    0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00
Y:    0.00   10.00   20.00   30.00   40.00   50.00   60.00   70.00   80.00   90.00
Y:    0.00  110.00  220.00  330.00  440.00  550.00  660.00  770.00  880.00  990.00
```

More details on HIP programming are available in the [HIP Programming Guide](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.5/page/Introduction_to_HIP_Programming_Guide.html).

## HIP and ROCm Libraries

The list of official AMD libraries can be found [here](https://docs.amd.com/category/libraries).

The libraries are installed in the same directory as ROCm:

```console
/opt/rocm/
```

The following libraries are installed:

```console
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipblas
drwxr-xr-x  3 root root   17 Jun  7 14:09 hipblas-clients
drwxr-xr-x  3 root root   29 Jun  7 14:09 hipcub
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipfft
drwxr-xr-x  3 root root   25 Jun  7 14:09 hipfort
drwxr-xr-x  4 root root   32 Jun  7 14:09 hiprand
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipsolver
drwxr-xr-x  4 root root   44 Jun  7 14:09 hipsparse
```

and

```console
drwxr-xr-x  4 root root   32 Jun  7 14:09 rocalution
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocblas
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocfft
drwxr-xr-x  4 root root   32 Jun  7 14:09 rocprim
drwxr-xr-x  4 root root   32 Jun  7 14:09 rocrand
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocsolver
drwxr-xr-x  4 root root   44 Jun  7 14:09 rocsparse
drwxr-xr-x  3 root root   29 Jun  7 14:09 rocthrust
```

## Using HipBlas Library

The basic code in HIP that uses hipBLAS looks like this.
This is a full code example; you can copy and paste it into a file.
For this example, we use `hipblas.hip.cpp`.

```cpp
#include <cstdio>
#include <vector>
#include <cstdlib>
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>


int main()
{
    srand(9600);

    int width = 10;
    int height = 7;
    int elem_count = width * height;


    // initialization of data in CPU memory

    float * h_A;
    hipHostMalloc(&h_A, elem_count * sizeof(*h_A));
    for(int i = 0; i < elem_count; i++)
        h_A[i] = (100.0f * rand()) / (float)RAND_MAX;
    printf("Matrix A:\n");
    for(int r = 0; r < height; r++)
    {
        for(int c = 0; c < width; c++)
            printf("%6.3f  ", h_A[r + height * c]);
        printf("\n");
    }

    float * h_x;
    hipHostMalloc(&h_x, width * sizeof(*h_x));
    for(int i = 0; i < width; i++)
        h_x[i] = (100.0f * rand()) / (float)RAND_MAX;
    printf("vector x:\n");
    for(int i = 0; i < width; i++)
        printf("%6.3f  ", h_x[i]);
    printf("\n");

    float * h_y;
    hipHostMalloc(&h_y, height * sizeof(*h_y));
    for(int i = 0; i < height; i++)
        h_y[i] = 100.0f + i;
    printf("vector y:\n");
    for(int i = 0; i < height; i++)
        printf("%6.3f  ", h_y[i]);
    printf("\n");


    // initialization of data in GPU memory

    float * d_A;
    size_t pitch_A;
    hipMallocPitch((void**)&d_A, &pitch_A, height * sizeof(*d_A), width);
    hipMemcpy2D(d_A, pitch_A, h_A, height * sizeof(*d_A), height * sizeof(*d_A), width, hipMemcpyHostToDevice);
    int lda = pitch_A / sizeof(float);

    float * d_x;
    hipMalloc(&d_x, width * sizeof(*d_x));
    hipMemcpy(d_x, h_x, width * sizeof(*d_x), hipMemcpyHostToDevice);

    float * d_y;
    hipMalloc(&d_y, height * sizeof(*d_y));
    hipMemcpy(d_y, h_y, height * sizeof(*d_y), hipMemcpyHostToDevice);


    // basic calculation of the result on the CPU

    float alpha=2.0f, beta=10.0f;

    for(int i = 0; i < height; i++)
        h_y[i] *= beta;
    for(int r = 0; r < height; r++)
        for(int c = 0; c < width; c++)
            h_y[r] += alpha * h_x[c] * h_A[r + height * c];
    printf("result y CPU:\n");
    for(int i = 0; i < height; i++)
        printf("%6.3f  ", h_y[i]);
    printf("\n");


    // calculation of the result on the GPU using the hipBLAS library

    hipblasHandle_t blas_handle;
    hipblasCreate(&blas_handle);

    hipblasSgemv(blas_handle, HIPBLAS_OP_N, height, width, &alpha, d_A, lda, d_x, 1, &beta, d_y, 1);
    hipDeviceSynchronize();

    hipblasDestroy(blas_handle);


    // copy the GPU result to CPU memory and print it
    hipMemcpy(h_y, d_y, height * sizeof(*d_y), hipMemcpyDeviceToHost);
    printf("result y BLAS:\n");
    for(int i = 0; i < height; i++)
        printf("%6.3f  ", h_y[i]);
    printf("\n");


    // free all the allocated memory
    hipFree(d_A);
    hipFree(d_x);
    hipFree(d_y);
    hipHostFree(h_A);
    hipHostFree(h_x);
    hipHostFree(h_y);

    return 0;
}
```

The code compilation can be done as follows:

```console
hipcc hipblas.hip.cpp -o hipblas.x -lhipblas
```

## Using HipSolver Library

The basic code in HIP that uses hipSOLVER looks like this.
This is a full code example; you can copy and paste it into a file.
For this example, we use `hipsolver.hip.cpp`.

```cpp
#include <cstdio>
#include <vector>
#include <cstdlib>
#include <algorithm>
#include <hipsolver/hipsolver.h>
#include <hipblas/hipblas.h>

int main()
{
    srand(63456);

    int size = 10;


    // allocation and initialization of data on host. this time we use std::vector

    int h_A_ld = size;
    int h_A_pitch = h_A_ld * sizeof(float);
    std::vector<float> h_A(size * h_A_ld);
    for(int r = 0; r < size; r++)
        for(int c = 0; c < size; c++)
            h_A[r * h_A_ld + c] = (10.0 * rand()) / RAND_MAX;
    printf("System matrix A:\n");
    for(int r = 0; r < size; r++)
    {
        for(int c = 0; c < size; c++)
            printf("%6.3f  ", h_A[r * h_A_ld + c]);
        printf("\n");
    }

    std::vector<float> h_b(size);
    for(int i = 0; i < size; i++)
        h_b[i] = (10.0 * rand()) / RAND_MAX;
    printf("RHS vector b:\n");
    for(int i = 0; i < size; i++)
        printf("%6.3f  ", h_b[i]);
    printf("\n");

    std::vector<float> h_x(size);


    // memory allocation on the device and initialization

    float * d_A;
    size_t d_A_pitch;
    hipMallocPitch((void**)&d_A, &d_A_pitch, size * sizeof(float), size); // width is given in bytes
    int d_A_ld = d_A_pitch / sizeof(float);

    float * d_b;
    hipMalloc(&d_b, size * sizeof(float));

    float * d_x;
    hipMalloc(&d_x, size * sizeof(float));

    int * d_piv;
    hipMalloc(&d_piv, size * sizeof(int));

    int * info;
    hipMallocManaged(&info, sizeof(int));

    hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
    hipMemcpy(d_b, h_b.data(), size * sizeof(float), hipMemcpyHostToDevice);


    // solving the system using hipSOLVER

    hipsolverHandle_t solverHandle;
    hipsolverCreate(&solverHandle);

    int wss_trf, wss_trs; // wss = WorkSpace Size
    hipsolverSgetrf_bufferSize(solverHandle, size, size, d_A, d_A_ld, &wss_trf);
    hipsolverSgetrs_bufferSize(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, &wss_trs);
    float * workspace;
    int wss = std::max(wss_trf, wss_trs);
    hipMalloc(&workspace, wss * sizeof(float));

    hipsolverSgetrf(solverHandle, size, size, d_A, d_A_ld, workspace, wss, d_piv, info);
    hipsolverSgetrs(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, workspace, wss, info);

    hipMemcpy(d_x, d_b, size * sizeof(float), hipMemcpyDeviceToDevice);
    hipMemcpy(h_x.data(), d_x, size * sizeof(float), hipMemcpyDeviceToHost);
    printf("Solution vector x:\n");
    for(int i = 0; i < size; i++)
        printf("%6.3f  ", h_x[i]);
    printf("\n");

    hipFree(workspace);

    hipsolverDestroy(solverHandle);


    // perform matrix-vector multiplication A*x using hipBLAS to check if the solution is correct

    hipblasHandle_t blasHandle;
    hipblasCreate(&blasHandle);

    float alpha = 1;
    float beta = 0;
    hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
    hipblasSgemv(blasHandle, HIPBLAS_OP_N, size, size, &alpha, d_A, d_A_ld, d_x, 1, &beta, d_b, 1);
    hipDeviceSynchronize();

    hipblasDestroy(blasHandle);

    for(int i = 0; i < size; i++)
        h_b[i] = 0;
    hipMemcpy(h_b.data(), d_b, size * sizeof(float), hipMemcpyDeviceToHost);
    printf("Check multiplication vector Ax:\n");
    for(int i = 0; i < size; i++)
        printf("%6.3f  ", h_b[i]);
    printf("\n");


    // free all the allocated memory

    hipFree(info);
    hipFree(d_piv);
    hipFree(d_x);
    hipFree(d_b);
    hipFree(d_A);

    return 0;
}
```

The code compilation can be done as follows:

```console
hipcc hipsolver.hip.cpp -o hipsolver.x -lhipblas -lhipsolver
```

## Using OpenMP Offload to Program AMD GPUs

The ROCm™ installation includes an LLVM-based implementation that fully supports the OpenMP 4.5 standard
and a subset of the OpenMP 5.0 standard.
Fortran, C/C++ compilers, and corresponding runtime libraries are included.

The OpenMP toolchain is automatically installed as part of the standard ROCm installation
and is available under `/opt/rocm/llvm`. The sub-directories are:

- `bin` : Compilers (flang and clang) and other binaries.
- `examples` : The usage section below shows how to compile and run these programs.
- `include` : Header files.
- `lib` : Libraries including those required for target offload.
- `lib-debug` : Debug versions of the above libraries.

More information can be found in the [AMD OpenMP Support Guide](https://docs.amd.com/bundle/OpenMP-Support-Guide-v5.5/page/Introduction_to_OpenMP_Support_Guide.html).

## Compilation of OpenMP Code

A basic example that uses OpenMP offload is shown below.
Again, the code is complete and can be copied and pasted into a file.
Here we use `vadd.cpp`.

```cpp
#include <cstdio>
#include <cstdlib>

int main(int argc, char ** argv)
{
    long long count = 1 << 20;
    if(argc > 1)
        count = atoll(argv[1]);
    long long print_count = 16;
    if(argc > 2)
        print_count = atoll(argv[2]);

    long long * a = new long long[count];
    long long * b = new long long[count];
    long long * c = new long long[count];

#pragma omp parallel for
    for(long long i = 0; i < count; i++)
    {
        a[i] = i;
        b[i] = 10 * i;
    }

    printf("A: ");
    for(long long i = 0; i < print_count; i++)
        printf("%3lld ", a[i]);
    printf("\n");

    printf("B: ");
    for(long long i = 0; i < print_count; i++)
        printf("%3lld ", b[i]);
    printf("\n");

#pragma omp target map(to: a[0:count],b[0:count]) map(from: c[0:count])
#pragma omp teams distribute parallel for
    for(long long i = 0; i < count; i++)
    {
        c[i] = a[i] + b[i];
    }

    printf("C: ");
    for(long long i = 0; i < print_count; i++)
        printf("%3lld ", c[i]);
    printf("\n");

    delete[] a;
    delete[] b;
    delete[] c;

    return 0;
}
```

This code can be compiled like this:

```console
/opt/rocm/llvm/bin/clang++ -O3 -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 vadd.cpp -o vadd.x
```

These options are required for target offload from an OpenMP program:

- `-target x86_64-pc-linux-gnu`
- `-fopenmp`
- `-fopenmp-targets=amdgcn-amd-amdhsa`
- `-Xopenmp-target=amdgcn-amd-amdhsa`

The following flag specifies the architecture of the targeted GPU.
You need to change this when moving, for instance, to LUMI with its MI250X GPUs.
The MI100 GPUs present in Complementary systems have the code `gfx908`:

- `-march=gfx908`

Note: You also have to include one of the optimization flags `-O0`, `-O2`, or `-O3`.
Without such a flag, the execution of the compiled code fails.
# Using ARM Partition

For testing your application on the ARM partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -A PROJECT-ID -p p01-arm
```

On the partition, you should reload the list of modules:

```console
ml architecture/aarch64
```

For compilation, `gcc` and `OpenMPI` compilers are available.
Hence, the compilation process should be the same as on the `x64` architecture.

Let's have the following `hello world` example:

```cpp
#include <cstdio>
#include "mpi.h"
#include "omp.h"

int main(int argc, char **argv)
{
        int rank;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        #pragma omp parallel
        {
                printf("Hello on rank %d, thread %d\n", rank, omp_get_thread_num());
        }
        MPI_Finalize();
}
```

You can compile and run the example:

```console
ml OpenMPI/4.1.4-GCC-11.3.0
mpic++ -fopenmp hello.cpp -o hello
mpirun -n 4 ./hello
```

Please see [gcc options](https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html) for more advanced compilation settings.
No complications are expected as long as the application does not use any intrinsics for the `x64` architecture.
If you want to use intrinsics, the
[SVE](https://developer.arm.com/documentation/102699/0100/Optimizing-with-intrinsics) instruction set is available.
# Using NVIDIA Grace Partition

For testing your application on the NVIDIA Grace Partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 144 -A PROJECT-ID -p p11-grace --time=08:00:00
```

where:

- `-N 1` means allocating a single node,
- `-c 144` means allocating 144 cores,
- `-p p11-grace` is the NVIDIA Grace partition,
- `--time=08:00:00` means allocation for 8 hours.

## Available Toolchains

The platform offers three toolchains:

- Standard GCC (as a module `ml GCC`)
- [NVHPC](https://developer.nvidia.com/hpc-sdk) (as a module `ml NVHPC`)
- [Clang for NVIDIA Grace](https://developer.nvidia.com/grace/clang) (installed in `/opt/nvidia/clang`)

!!! note
    The NVHPC toolchain showed strong results with a minimal amount of tuning necessary in our initial evaluation.

### GCC Toolchain

The GCC compiler seems to struggle with the vectorization of short (constant-length) loops, which tend to get completely unrolled/eliminated instead of being vectorized. For example, a simple nested loop such as

```cpp
for(int i = 0; i < 1000000; ++i) {
    // Iterations dependent in "i"
    // ...
    for(int j = 0; j < 8; ++j) {
        // but independent in "j"
        // ...
    }
}
```

may emit scalar code for the inner loop leading to no vectorization being used at all.

### Clang (For Grace) Toolchain

Clang/LLVM tends to behave similarly, but can be guided to properly vectorize the inner loop either with the flags `-O3 -ffast-math -march=native -fno-unroll-loops -mllvm -force-vector-width=8` or with pragmas such as `#pragma clang loop vectorize_width(8)` and `#pragma clang loop unroll(disable)`.

```cpp
for(int i = 0; i < 1000000; ++i) {
    // Iterations dependent in "i"
    // ...
    #pragma clang loop unroll(disable) vectorize_width(8)
    for(int j = 0; j < 8; ++j) {
        // but independent in "j"
        // ...
    }
}
```

!!! note
    Our basic experiments show that fixed-width vectorization (NEON) tends to perform better than SVE in the case of short (register-length) loops. In cases (like the one above) where the specified `vectorize_width` is larger than the available vector unit width, Clang will emit multiple NEON instructions (e.g., 4 instructions will be emitted to process 8 64-bit operations in the 128-bit units of Grace).

### NVHPC Toolchain

The NVHPC toolchain handled the aforementioned case without any additional tuning. A simple `-O3 -march=native -fast` should therefore be sufficient.

## Basic Math Libraries

The basic libraries (BLAS and LAPACK) are included in the NVHPC toolchain and can be used simply with `-lblas` and `-llapack` for BLAS and LAPACK, respectively (`lp64` and `ilp64` versions are also included).

!!! note
    The Grace platform doesn't include a CUDA-capable GPU; therefore, `nvcc` will fail with an error. This means that `nvc`, `nvc++`, and `nvfortran` should be used instead.

### NVIDIA Performance Libraries

The [NVPL](https://developer.nvidia.com/nvpl) package includes a more extensive set of libraries in both sequential and multi-threaded versions:

- BLACS: `-lnvpl_blacs_{lp64,ilp64}_{mpich,openmpi3,openmpi4,openmpi5}`
- BLAS: `-lnvpl_blas_{lp64,ilp64}_{seq,gomp}`
- FFTW: `-lnvpl_fftw`
- LAPACK: `-lnvpl_lapack_{lp64,ilp64}_{seq,gomp}`
- ScaLAPACK: `-lnvpl_scalapack_{lp64,ilp64}`
- RAND: `-lnvpl_rand` or `-lnvpl_rand_mt`
- SPARSE: `-lnvpl_sparse`

This package should be compatible with all available toolchains and includes CMake module files for easy integration into CMake-based projects. For further documentation, see also [NVPL](https://docs.nvidia.com/nvpl).

### Recommended BLAS Library

We recommend to use the multi-threaded BLAS library from the NVPL package.

!!! note
    It is important to pin the processes using **OMP_PROC_BIND=spread**

Example:

```console
$ ml NVHPC
$ nvc -O3 -march=native myprog.c -o myprog -lnvpl_blas_lp64_gomp
$ OMP_PROC_BIND=spread ./myprog
```

## Basic Communication Libraries

The OpenMPI 4 implementation is included with the NVHPC toolchain and is exposed as a module (`ml OpenMPI`). The following example

```cpp
#include <cstdio>
#include <mpi.h>
#include <sched.h>
#include <omp.h>

int main(int argc, char **argv)
{
        int rank;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        #pragma omp parallel
        {
                printf("Hello on rank %d, thread %d on CPU %d\n", rank, omp_get_thread_num(), sched_getcpu());
        }
        MPI_Finalize();
}
```

can be compiled and run as follows

```console
ml OpenMPI
mpic++ -fast -fopenmp hello.cpp -o hello
OMP_PROC_BIND=close OMP_NUM_THREADS=4 mpirun -np 4 --map-by slot:pe=36 ./hello
```

In this configuration, we run 4 ranks, each bound to one quarter of the cores, with 4 OpenMP threads per rank.

## Simple BLAS Application

The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS 3 routine).

Stationary probability vector estimation in `C++`:

```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"

const size_t ITERATIONS  = 32;
const size_t MATRIX_SIZE = 1024;

int main(int argc, char *argv[])
{
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;

    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;

    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);

    std::copy(a.begin(), a.end(), w1.begin());

    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;

    auto c1 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);

        std::swap(t1, t2);
    }

    auto c2 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }

    std::cout << std::endl;

    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    return 0;
}
```

Stationary probability vector estimation in `Fortran`:

```fortran
program main
    implicit none

    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2

    iterations  = 32
    matrix_size = 1024

    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)

    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))

    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5

    w1 = a
    w2(:,:) = 0.0

    t1 => w1
    t2 => w2

    call system_clock(c1)

    do i = 0, iterations
        t2(:,:) = 0.0

        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)

        tmp => t1
        t1  => t2
        t2  => tmp
    end do

    call system_clock(c2)

    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)

    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)

    deallocate(a)
    deallocate(w1)
    deallocate(w2)
end program main
```

### Using NVHPC Toolchain

The C++ version of the example can be compiled with NVHPC and run as follows:

```console
ml NVHPC
nvc++ -O3 -march=native -fast -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lblas main.cpp -o main
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main
```

The Fortran version is just as simple:

```console
ml NVHPC
nvfortran -O3 -march=native -fast -lblas main.f90 -o main.x
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main.x
```

!!! note
    It may be advantageous to use the NVPL libraries instead of the NVHPC ones. For example, the DGEMM BLAS 3 routine from NVPL is almost 30% faster than the NVHPC one.

### Using Clang (For Grace) Toolchain

Similarly, the Clang for Grace toolchain with NVPL BLAS can be used to compile the C++ version of the example.

```console
ml NVHPC
/opt/nvidia/clang/17.23.11/bin/clang++ -O3 -march=native -ffast-math -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lnvpl_blas_lp64_gomp main.cpp -o main
```

!!! note
    The NVHPC module is used just for the `cblas.h` include in this case. This can be avoided by changing the code to use `nvpl_blas.h` instead.

## Additional Resources

- [https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/][1]
- [https://developer.nvidia.com/hpc-sdk][2]
- [https://developer.nvidia.com/grace/clang][3]
- [https://docs.nvidia.com/nvpl][4]

[1]: https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/
[2]: https://developer.nvidia.com/hpc-sdk
[3]: https://developer.nvidia.com/grace/clang
[4]: https://docs.nvidia.com/nvpl
# Heterogeneous Memory Management on Intel Platforms

Partition `p10-intel` offers heterogeneous memory directly exposed to the user. This allows users to manually pick the appropriate kind of memory to be used at process or even single-allocation granularity. Both kinds of memory are exposed as memory-only NUMA nodes, which allows both coarse-grained (process level) and fine-grained (allocation level) control over the memory type used.

## Overview

At the process level, the `numactl` facilities can be utilized, while the Intel-provided `memkind` library allows for finer control. Both the `memkind` library and `numactl` can be accessed by loading the `memkind` module or the `OpenMPI` module (`numactl` only).

```bash
ml memkind
```

### Process Level (NUMACTL)

The `numactl` utility allows you to either restrict the memory pool of the process to a specific set of NUMA memory nodes

```bash
numactl --membind <node_ids_set>
```

or to select a single preferred node

```bash
numactl --preferred <node_id>
```

where `<node_ids_set>` is a comma-separated list (e.g., `0,2,5,...`), possibly in combination with ranges (such as `0-5`). The `membind` option kills the process if it requests more memory than can be satisfied from the specified nodes. The `preferred` option just reverts to using other nodes according to their NUMA distance in the same situation.

A convenient way to check the `numactl` configuration is

```bash
numactl -s
```

which prints the configuration of its execution environment, e.g.:

```bash
numactl --membind 8-15 numactl -s
policy: bind
preferred node: 0
physcpubind: 0 1 2 ... 189 190 191
cpubind: 0 1 2 3 4 5 6 7
nodebind: 0 1 2 3 4 5 6 7
membind: 8 9 10 11 12 13 14 15
```

The last row shows that memory allocations are restricted to NUMA nodes `8-15`.

### Allocation Level (MEMKIND)

The `memkind` library (in its simplest use case) offers a new variant of the `malloc/free` function pair, which allows you to specify the kind of memory to be used for a given allocation. Moving a specific allocation from the default to the HBM memory pool can then be achieved by replacing:

```cpp
void *pData = malloc(<SIZE>);
/* ... */
free(pData);
```

with

```cpp
#include <memkind.h>

void *pData = memkind_malloc(MEMKIND_HBW, <SIZE>);
/* ... */
memkind_free(NULL, pData); // "kind" parameter is deduced from the address
```

Similarly, other kinds of memory can be chosen.

!!! note
    The allocation will return a `NULL` pointer when memory of the specified kind is not available.

## High Bandwidth Memory (HBM)

Intel Sapphire Rapids (partition `p10-intel`) consists of two sockets, each with `128GB` of DDR and `64GB` of on-package HBM memory. The machine is configured in FLAT mode and therefore exposes the HBM memory as memory-only NUMA nodes (`16GB` per 12-core tile). The configuration can be verified by running

```bash
numactl -H
```

which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR DRAM, while `8-15` should have no cores and `16GB` of HBM each).

![](../../img/cs/guides/p10_numa_sc4_flat.png)

### Process Level

With this, we can easily restrict an application to DDR DRAM or HBM memory:

```bash
# Only DDR DRAM
numactl --membind 0-7 ./stream
# ...
Function    Best Rate MB/s  Avg time     Min time     Max time
Copy:          369745.8     0.043355     0.043273     0.043588
Scale:         366989.8     0.043869     0.043598     0.045355
Add:           378054.0     0.063652     0.063483     0.063899
Triad:         377852.5     0.063621     0.063517     0.063884

# Only HBM
numactl --membind 8-15 ./stream
# ...
Function    Best Rate MB/s  Avg time     Min time     Max time
Copy:         1128430.1     0.015214     0.014179     0.015615
Scale:        1045065.2     0.015814     0.015310     0.016309
Add:          1096992.2     0.022619     0.021878     0.024182
Triad:        1065152.4     0.023449     0.022532     0.024559
```

The DDR DRAM achieves a bandwidth of around 400GB/s, while the HBM clears the 1TB/s bar.

Some further improvements can be achieved by entirely isolating a process to a single tile. This can be useful for MPI jobs, where `$OMPI_COMM_WORLD_RANK` can be used to bind each process individually. A simple wrapper script to do this may look like

```bash
#!/bin/bash
numactl --membind $((8 + $OMPI_COMM_WORLD_RANK)) $@
```

and can be used as

```bash
mpirun -np 8 --map-by slot:pe=12 membind_wrapper.sh ./stream_mpi
```

(8 tiles with 12 cores each). However, this approach assumes `16GB` of HBM memory local to the tile is sufficient for each process (memory cannot spill between tiles). This approach may be significantly more useful in combination with `--preferred` instead of `--membind` to force preference of local HBM with spill to DDR DRAM. Otherwise

```bash
mpirun -n 8 --map-by slot:pe=12 numactl --membind 8-15 ./stream_mpi
```

is most likely preferable even for MPI workloads. Applying the above approach to MPI Stream with 8 ranks and 1-24 threads per rank, we can expect these results:
![](../../img/cs/guides/p10_stream_dram.png)
![](../../img/cs/guides/p10_stream_hbm.png)
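
For reference, a sketch of the `--preferred` variant of the wrapper script mentioned above (illustrative only; it assumes Open MPI's `$OMPI_COMM_WORLD_RANK` just like the original wrapper):

```bash
#!/bin/bash
# prefer the HBM node local to this rank's tile, but allow allocations
# to spill over to DDR DRAM once the 16GB of local HBM is exhausted
exec numactl --preferred $((8 + $OMPI_COMM_WORLD_RANK)) "$@"
```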

### Allocation Level

Allocation-level selection of the memory kind using the `memkind` library can be illustrated on a modified stream benchmark. The stream benchmark uses three working arrays (A, B, and C), whose allocation can be changed to `memkind_malloc` as follows:

```cpp
#include <memkind.h>
// ...
STREAM_TYPE *a = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *b = (STREAM_TYPE *)memkind_malloc(MEMKIND_REGULAR, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *c = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
// ...
memkind_free(NULL, a);
memkind_free(NULL, b);
memkind_free(NULL, c);
```

Arrays A and C are allocated from HBM (`MEMKIND_HBW_ALL`), while DDR DRAM (`MEMKIND_REGULAR`) is used for B.
The code then has to be linked against the `memkind` library

```bash
gcc -march=native -O3 -fopenmp -lmemkind memkind_stream.c -o memkind_stream
```

and can be run as

```bash
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
OMP_NUM_THREADS=$((N*12)) OMP_PROC_BIND=spread ./memkind_stream
```

While the `memkind` library should be able to detect HBM memory on its own (through `HMAT` and `hwloc`), this is not supported on `p10-intel`. This means that the NUMA nodes representing HBM have to be specified manually using the `MEMKIND_HBW_NODES` environment variable.

![](../../img/cs/guides/p10_stream_memkind.png)

With this setup, we can see that the simple copy operation (`C[i] = A[i]`) achieves bandwidth comparable to the application bound entirely to HBM memory. On the other hand, the scale operation (`B[i] = s*C[i]`) is mostly limited by the DDR DRAM bandwidth. It is also worth noting that operations combining all three arrays perform close to the HBM-only configuration.

## Simple Application

One application that can greatly benefit from the availability of a large slower memory and a smaller faster memory is computing a histogram with many bins over a large dataset.

```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cstring>
#include <cstdlib> // drand48_r, srand48_r
#include <omp.h>
#include <memkind.h>

const size_t N_DATA_SIZE  = 2 * 1024 * 1024 * 1024ull;
const size_t N_BINS_COUNT = 1 * 1024 * 1024ull;
const size_t N_ITERS      = 10;

#if defined(HBM)
    #define DATA_MEMKIND MEMKIND_REGULAR
    #define BINS_MEMKIND MEMKIND_HBW_ALL
#else
    #define DATA_MEMKIND MEMKIND_REGULAR
    #define BINS_MEMKIND MEMKIND_REGULAR
#endif

int main(int argc, char *argv[])
{
    const double binWidth = 1.0 / double(N_BINS_COUNT + 1);

    double *pData = (double *)memkind_malloc(DATA_MEMKIND, N_DATA_SIZE * sizeof(double));
    size_t *pBins = (size_t *)memkind_malloc(BINS_MEMKIND, N_BINS_COUNT * omp_get_max_threads() * sizeof(size_t));

    #pragma omp parallel
    {
        drand48_data state;
        srand48_r(omp_get_thread_num(), &state);

        #pragma omp for
        for(size_t i = 0; i < N_DATA_SIZE; ++i)
            drand48_r(&state, &pData[i]);
    }

    auto c1 = std::chrono::steady_clock::now();

    for(size_t it = 0; it < N_ITERS; ++it)
    {
        #pragma omp parallel
        {
            for(size_t i = 0; i < N_BINS_COUNT; ++i)
                pBins[omp_get_thread_num()*N_BINS_COUNT + i] = size_t(0);

            #pragma omp for
            for(size_t i = 0; i < N_DATA_SIZE; ++i)
            {
                const size_t idx = size_t(pData[i] / binWidth) % N_BINS_COUNT;
                pBins[omp_get_thread_num()*N_BINS_COUNT + idx]++;
            }
        }
    }

    auto c2 = std::chrono::steady_clock::now();

    #pragma omp parallel for
    for(size_t i = 0; i < N_BINS_COUNT; ++i)
    {
        for(size_t j = 1; j < omp_get_max_threads(); ++j)
            pBins[i] += pBins[j*N_BINS_COUNT + i];
    }

    std::cout << "Elapsed Time [s]: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    size_t total = 0;
    #pragma omp parallel for reduction(+:total)
    for(size_t i = 0; i < N_BINS_COUNT; ++i)
        total += pBins[i];

    std::cout << "Total Items: " << total << std::endl;

    memkind_free(NULL, pData);
    memkind_free(NULL, pBins);

    return 0;
}
```

### Using HBM Memory (P10-Intel)

The following commands can be used to compile and run the example application above:

```bash
ml GCC memkind
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
g++ -O3 -fopenmp -lmemkind histogram.cpp -o histogram_dram
g++ -O3 -fopenmp -lmemkind -DHBM histogram.cpp -o histogram_hbm
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_dram
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_hbm
```

Moving the histogram bins into HBM memory should speed up the algorithm more than twice. It should be noted that also moving the `pData` array into HBM memory worsens this result (presumably because, with the bins in HBM and the data in DDR DRAM, the algorithm can saturate both memory interfaces).

## Additional Resources

- [https://linux.die.net/man/8/numactl][1]
- [http://memkind.github.io/memkind/man_pages/memkind.html][2]
- [https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory][3]

[1]: https://linux.die.net/man/8/numactl
[2]: http://memkind.github.io/memkind/man_pages/memkind.html
[3]: https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory
# Using VMware Horizon

VMware Horizon is a virtual desktop infrastructure (VDI) solution
that enables users to access virtual desktops and applications from any device and any location.
It provides a comprehensive end-to-end solution for managing and delivering virtual desktops and applications,
including features such as session management, user authentication, and virtual desktop provisioning.

![](../../img/horizon.png)

## How to Access VMware Horizon

!!! important
    Access to VMware Horizon requires IT4I VPN.

1. Contact [IT4I support][a] with a request for access and VM allocation.
1. [Download][1] and install the VMware Horizon Client for Windows.
1. Add a new server `https://vdi-cs01.msad.it4i.cz/` in the Horizon client.
1. Connect to the server using your IT4I username and password.
   Username is in the `domain\username` format and the domain is `msad.it4i.cz`.
   For example: `msad.it4i.cz\user123`

## Example

Below is an example of how to mount a remote folder and check the connection on Windows OS:

### Prerequisites

3D applications

* [Blender][3]

SSHFS for remote access

* [sshfs-win][4]
* [winfsp][5]
* [sshfs-win-manager][6]
* SSH keys for access to the clusters

### Steps

1. Start the VPN and connect to the server via VMware Horizon Client.

    ![](../../img/vmware.png)

1. Mount a remote folder.
    * Run sshfs-win-manager.

    ![](../../img/sshfs.png)

    * Add a new connection.

    ![](../../img/sshfs1.png)

    * Click on **Connect**.

    ![](../../img/sshfs2.png)

1. Check that the folder is mounted.

    ![](../../img/mount.png)

1. Check the GPU resources.

    ![](../../img/gpu.png)

### Blender

Now if you run, for example, Blender, you can check the available GPU resources in Blender Preferences.

  ![](../../img/blender.png)

[a]: mailto:support@it4i.cz

[1]: https://vdi-cs01.msad.it4i.cz/
[2]: https://www.paraview.org/download/
[3]: https://www.blender.org/download/
[4]: https://github.com/winfsp/sshfs-win/releases
[5]: https://github.com/winfsp/winfsp/releases/
[6]: https://github.com/evsar3/sshfs-win-manager/releases
# Using IBM Power Partition

For testing your application on the IBM Power partition,
you need to prepare a job script for that partition or use the interactive job:

```console
salloc -N 1 -c 192 -A PROJECT-ID -p p07-power --time=08:00:00
```

where:

- `-N 1` means allocating a single node,
- `-c 192` means allocating 192 cores (threads),
- `-p p07-power` is the IBM Power partition,
- `--time=08:00:00` means an allocation for 8 hours.

On the partition, you should reload the list of modules:

```
ml architecture/ppc64le
```

The platform offers both `GNU`-based and proprietary IBM toolchains for building applications. IBM also provides an optimized BLAS routines library ([ESSL](https://www.ibm.com/docs/en/essl/6.1)), which can be used with both toolchains.

## Building Applications

Our sample application depends on `BLAS`, therefore we start by loading the following modules (regardless of which toolchain we want to use):

```
ml GCC OpenBLAS
```

### GCC Toolchain

In the case of the GCC toolchain, we can go ahead and compile the application using either `g++`

```
g++ -lopenblas hello.cpp -o hello
```

or `gfortran`

```
gfortran -lopenblas hello.f90 -o hello
```

as usual.

### IBM Toolchain

The IBM toolchain requires additional environment setup, as it is installed in `/opt/ibm` and is not exposed as a module:

```
IBM_ROOT=/opt/ibm
OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1
OPENXLF_ROOT=$IBM_ROOT/openxlf/17.1.1

export PATH=$OPENXLC_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLC_ROOT/lib:$LD_LIBRARY_PATH

export PATH=$OPENXLF_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH
```

From there, we can use either `ibm-clang++`

```
ibm-clang++ -lopenblas hello.cpp -o hello
```

or `xlf`

```
xlf -lopenblas hello.f90 -o hello
```

to build the application as usual.

!!! note
    The combination of `xlf` and `openblas` seems to cause severe performance degradation. Therefore, the `ESSL` library should be preferred (see below).

### Using ESSL Library

The [ESSL](https://www.ibm.com/docs/en/essl/6.1) library is installed in `/opt/ibm/math/essl/7.1`, so we define additional environment variables:

```
IBM_ROOT=/opt/ibm
ESSL_ROOT=${IBM_ROOT}/math/essl/7.1
export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH
```

The simplest way to utilize `ESSL` in an application that already uses `BLAS` or `CBLAS` routines is to link with the provided `libessl.so`. This can be done by replacing `-lopenblas` with `-lessl`, or with `-lessl -lopenblas` (in case `ESSL` does not provide all of the required `BLAS` routines).
In practice, this can look like

```
g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello
```

or

```
gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello
```

and similarly for IBM compilers (`ibm-clang++` and `xlf`).
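
To double-check which BLAS implementation a binary actually resolves at run time (with `LD_LIBRARY_PATH` set up as above), the shared-library dependencies can be inspected; this is just a quick sanity check, not a required build step:

```
ldd ./hello | grep -i -E 'essl|openblas'
```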

## Hello World Applications

The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation (repeated application of a transition matrix) to illustrate the use of GEMM (a BLAS 3 routine).

Stationary probability vector estimation in `C++`:

```c++
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"

const size_t ITERATIONS  = 32;
const size_t MATRIX_SIZE = 1024;

int main(int argc, char *argv[])
{
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;

    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;

    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);

    std::copy(a.begin(), a.end(), w1.begin());

    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;

    auto c1 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);

        std::swap(t1, t2);
    }

    auto c2 = std::chrono::steady_clock::now();

    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }

    std::cout << std::endl;

    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    return 0;
}
```

Stationary probability vector estimation in `Fortran`:

```fortran
program main
    implicit none

    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2

    iterations  = 32
    matrix_size = 1024

    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)

    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))

    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5

    w1 = a
    w2(:,:) = 0.0

    t1 => w1
    t2 => w2

    call system_clock(c1)

    do i = 1, iterations
        t2(:,:) = 0.0

        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)

        tmp => t1
        t1  => t2
        t2  => tmp
    end do

    call system_clock(c2)

    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)

    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)

    deallocate(a)
    deallocate(w1)
    deallocate(w2)
end program main
```
# Introduction

Complementary systems offer a development environment for users
who need to port and optimize their code and applications
for various hardware architectures and software technologies
that are not available on standard clusters.

## Complementary Systems 1

The first stage of the complementary systems implementation comprises these partitions:

- compute partition 0 – based on ARM technology – legacy
- compute partition 1 – based on ARM technology – A64FX
- compute partition 2 – based on Intel technologies – Ice Lake, NVDIMMs + Bitware FPGAs
- compute partition 3 – based on AMD technologies – Milan, MI100 GPUs + Xilinx FPGAs
- compute partition 4 – reflecting the Edge type of servers
- partition 5 – FPGA synthesis server

![](../img/cs1_1.png)

## Complementary Systems 2

The second stage of the complementary systems implementation comprises these partitions:

- compute partition 6 - based on ARM technology + CUDA-programmable GPGPU accelerators of the Ampere architecture + DPU network processing units
- compute partition 7 - based on the IBM Power10 architecture
- compute partition 8 - a modern CPU with a very high L3 cache capacity (over 750MB)
- compute partition 9 - virtual GPU-accelerated workstations
- compute partition 10 - Sapphire Rapids-HBM server
- compute partition 11 - NVIDIA Grace CPU Superchip

![](../img/cs2_2.png)

## Modules and Architecture Availability

Complementary systems list available modules automatically based on the detected architecture.

However, you can load one of the three modules -- `aarch64`, `avx2`, and `avx512` --
to reload the list of modules available for the respective architecture:

```console
[user@login.cs ~]$ ml architecture/aarch64

  aarch64 modules + all modules

[user@login.cs ~]$ ml architecture/avx2

  avx2 modules + all modules

[user@login.cs ~]$ ml architecture/avx512

  avx512 modules + all modules
```
# Complementary System Job Scheduling

## Introduction

[Slurm][1] workload manager is used to allocate and access Complementary systems resources.

## Getting Partition Information

Display partitions/queues:

```console
$ sinfo -s
PARTITION AVAIL  TIMELIMIT   NODES(A/I/O/T) NODELIST
p00-arm      up 1-00:00:00          0/1/0/1 p00-arm01
p01-arm*     up 1-00:00:00          0/8/0/8 p01-arm[01-08]
p02-intel    up 1-00:00:00          0/2/0/2 p02-intel[01-02]
p03-amd      up 1-00:00:00          0/2/0/2 p03-amd[01-02]
p04-edge     up 1-00:00:00          0/1/0/1 p04-edge01
p05-synt     up 1-00:00:00          0/1/0/1 p05-synt01
p06-arm      up 1-00:00:00          0/2/0/2 p06-arm[01-02]
p07-power    up 1-00:00:00          0/1/0/1 p07-power01
p08-amd      up 1-00:00:00          0/1/0/1 p08-amd01
p10-intel    up 1-00:00:00          0/1/0/1 p10-intel01
```

## Getting Job Information

Show jobs:

```console
$ squeue --me
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
               104   p01-arm interact    user   R       1:48      2 p01-arm[01-02]
```

Show job details for a specific job:

```console
$ scontrol -d show job JOBID
```

Show job details for the executing job from within the job session:

```console
$ scontrol -d show job $SLURM_JOBID
```

## Running Interactive Jobs

Run an interactive job:

```console
 $ salloc -A PROJECT-ID -p p01-arm
```

Run an interactive job with X11 forwarding:

```console
 $ salloc -A PROJECT-ID -p p01-arm --x11
```

!!! warning
    Do not use `srun` for initiating interactive jobs, subsequent `srun`, `mpirun` invocations would block forever.

## Running Batch Jobs

Run a batch job:

```console
 $ sbatch -A PROJECT-ID -p p01-arm ./script.sh
```
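
The `./script.sh` above is an ordinary shell script with optional `#SBATCH` directives. A minimal sketch might look as follows (illustrative only; resource values are placeholders, e.g. 48 cores matches the p01-arm nodes, and the account/partition are supplied on the `sbatch` command line as shown above):

```
#!/bin/bash
#SBATCH --job-name=example
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --time=02:00:00

./my_application   # placeholder for your program
```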

Useful command options (`salloc`, `sbatch`, `srun`):

* `-n`, `--ntasks`
* `-c`, `--cpus-per-task`
* `-N`, `--nodes`

## Slurm Job Environment Variables

Slurm provides useful information to the job via environment variables. The environment variables are available on all nodes allocated to the job when accessed via Slurm-supported means (`srun`, compatible `mpirun`).

See all Slurm variables:

```
set | grep ^SLURM
```

### Useful Variables

| variable name | description | example |
| ------ | ------ | ------ |
| SLURM_JOB_ID | job id of the executing job| 593 |
| SLURM_JOB_NODELIST | nodes allocated to the job | p03-amd[01-02] |
| SLURM_JOB_NUM_NODES | number of nodes allocated to the job | 2 |
| SLURM_STEP_NODELIST | nodes allocated to the job step | p03-amd01 |
| SLURM_STEP_NUM_NODES | number of nodes allocated to the job step | 1 |
| SLURM_JOB_PARTITION | name of the partition | p03-amd |
| SLURM_SUBMIT_DIR | submit directory | /scratch/project/open-xx-yy/work |

See [Slurm srun documentation][2] for details.

Get the job nodelist:

```
$ echo $SLURM_JOB_NODELIST
p03-amd[01-02]
```

Expand the nodelist to a list of nodes:

```
$ scontrol show hostnames $SLURM_JOB_NODELIST
p03-amd01
p03-amd02
```

## Modifying Jobs

```
$ scontrol update JobId=JOBID ATTR=VALUE
```

for example:

```
$ scontrol update JobId=JOBID Comment='The best job ever'
```

## Deleting Jobs

```
$ scancel JOBID
```

## Partitions

| PARTITION | nodes | whole node | cores per node | features |
| --------- | ----- | ---------- | -------------- | -------- |
| p00-arm   | 1     | yes        | 64             | aarch64,cortex-a72 |
| p01-arm   | 8     | yes        | 48             | aarch64,a64fx,ib |
| p02-intel | 2     | no         | 64             | x86_64,intel,icelake,ib,fpga,bitware,nvdimm |
| p03-amd   | 2     | no         | 64             | x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx |
| p04-edge  | 1     | yes        | 16             | x86_64,intel,broadwell,ib |
| p05-synt  | 1     | yes        | 8              | x86_64,amd,milan,ib,ht |
| p06-arm   | 2     | yes        | 80             | aarch64,ib |
| p07-power | 1     | yes        | 192            | ppc64le,ib |
| p08-amd   | 1     | yes        | 128            | x86_64,amd,milan-x,ib,ht |
| p10-intel | 1     | yes        | 96             | x86_64,intel,sapphire_rapids,ht|

Use the `-t`/`--time` option to specify the job run time limit. The default job time limit is 2 hours, the maximum job time limit is 24 hours.
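
For example, to request a 12-hour time limit (an illustrative value within the 24-hour maximum):

```console
salloc -A PROJECT-ID -p p01-arm -t 12:00:00
```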

FIFO scheduling with backfilling is employed.

## Partition 00 - ARM (Cortex-A72)

Whole node allocation.

One node:

```console
salloc -A PROJECT-ID -p p00-arm
```

## Partition 01 - ARM (A64FX)

Whole node allocation.

One node:

```console
salloc -A PROJECT-ID -p p01-arm
```

```console
salloc -A PROJECT-ID -p p01-arm -N 1
```

Multiple nodes:

```console
salloc -A PROJECT-ID -p p01-arm -N 8
```

## Partition 02 - Intel (Ice Lake, NVDIMMs + Bitware FPGAs)

FPGAs are treated as resources. See below for more details about resources.

Partial allocation - per FPGA, resource separation is not enforced.
Use only FPGAs allocated to the job!

One FPGA:

```console
salloc -A PROJECT-ID -p p02-intel --gres=fpga
```

Two FPGAs on the same node:

```console
salloc -A PROJECT-ID -p p02-intel --gres=fpga:2
```

All FPGAs:

```console
salloc -A PROJECT-ID -p p02-intel -N 2 --gres=fpga:2
```

## Partition 03 - AMD (Milan, MI100 GPUs + Xilinx FPGAs)

GPUs and FPGAs are treated as resources. See below for more details about resources.

Partial allocation - per GPU and per FPGA, resource separation is not enforced.
Use only GPUs and FPGAs allocated to the job!

One GPU:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu
```

Two GPUs on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu:2
```

Four GPUs on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu:4
```

All GPUs:

```console
salloc -A PROJECT-ID -p p03-amd -N 2 --gres=gpu:4
```

One FPGA:

```console
salloc -A PROJECT-ID -p p03-amd --gres=fpga
```

Two FPGAs:

```console
salloc -A PROJECT-ID -p p03-amd --gres=fpga:2
```

All FPGAs:

```console
salloc -A PROJECT-ID -p p03-amd -N 2 --gres=fpga:2
```

One GPU and one FPGA on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu,fpga
```

Four GPUs and two FPGAs on the same node:

```console
salloc -A PROJECT-ID -p p03-amd --gres=gpu:4,fpga:2
```

All GPUs and FPGAs:

```console
salloc -A PROJECT-ID -p p03-amd -N 2 --gres=gpu:4,fpga:2
```

## Partition 04 - Edge Server

Whole node allocation:

```console
salloc -A PROJECT-ID -p p04-edge
```

## Partition 05 - FPGA Synthesis Server

Whole node allocation:

```console
salloc -A PROJECT-ID -p p05-synt
```

## Partition 06 - ARM

Whole node allocation:

```console
salloc -A PROJECT-ID -p p06-arm
```

## Partition 07 - IBM Power

Whole node allocation:

```console
salloc -A PROJECT-ID -p p07-power
```

## Partition 08 - AMD Milan-X

Whole node allocation:

```console
salloc -A PROJECT-ID -p p08-amd
```

## Partition 10 - Intel Sapphire Rapids

Whole node allocation:

```console
salloc -A PROJECT-ID -p p10-intel
```

## Features

Nodes have feature tags assigned to them.
Users can select nodes based on the feature tags using the `--constraint` option.

| Feature | Description |
| ------ | ------ |
| aarch64 | platform |
| x86_64 | platform |
| ppc64le | platform |
| amd | manufacturer |
| intel | manufacturer |
| icelake | processor family |
| broadwell | processor family |
| sapphire_rapids | processor family |
| milan | processor family |
| milan-x | processor family |
| ib | Infiniband |
| gpu | equipped with GPU |
| fpga | equipped with FPGA |
| nvdimm | equipped with NVDIMMs |
| ht | Hyperthreading enabled |
| noht | Hyperthreading disabled |

```
$ sinfo -o '%16N %f'
NODELIST         AVAIL_FEATURES
p00-arm01        aarch64,cortex-a72
p01-arm[01-08]   aarch64,a64fx,ib
p02-intel01      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,ht
p02-intel02      x86_64,intel,icelake,ib,fpga,bitware,nvdimm,noht
p03-amd02        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,noht
p03-amd01        x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,ht
p04-edge01       x86_64,intel,broadwell,ib,ht
p05-synt01       x86_64,amd,milan,ib,ht
p06-arm[01-02]   aarch64,ib
p07-power01      ppc64le,ib
p08-amd01        x86_64,amd,milan-x,ib,ht
p10-intel01      x86_64,intel,sapphire_rapids,ht
```

```
$ salloc -A PROJECT-ID -p p02-intel --constraint noht
```

```
$ scontrol -d show node p02-intel02 | grep ActiveFeatures
   ActiveFeatures=x86_64,intel,icelake,ib,fpga,bitware,nvdimm,noht
```

## Resources, GRES

Slurm supports the ability to define and schedule arbitrary resources - Generic RESources (GRES) in Slurm's terminology. We use GRES for scheduling/allocating GPUs and FPGAs.

!!! warning
    Use only allocated GPUs and FPGAs. Resource separation is not enforced. If you use non-allocated resources, you may observe strange behavior and get into trouble.

### Node Resources

Get information about GRES on a node:

```
$ scontrol -d show node p02-intel01 | grep Gres=
   Gres=fpga:bitware_520n_mx:2
$ scontrol -d show node p02-intel02 | grep Gres=
   Gres=fpga:bitware_520n_mx:2
$ scontrol -d show node p03-amd01 | grep Gres=
   Gres=gpu:amd_mi100:4,fpga:xilinx_alveo_u250:2
$ scontrol -d show node p03-amd02 | grep Gres=
   Gres=gpu:amd_mi100:4,fpga:xilinx_alveo_u280:2
```

### Request Resources

To allocate the required resources (GPUs or FPGAs), use the `--gres` option of `salloc`/`srun`.

Example: Allocate one FPGA

```
$ salloc -A PROJECT-ID -p p03-amd --gres fpga:1
```

### Find Out Allocated Resources

Information about the allocated resources is available in the Slurm job details, in the `JOB_GRES` and `GRES` attributes.

```
$ scontrol -d show job $SLURM_JOBID |grep GRES=
   JOB_GRES=fpga:xilinx_alveo_u250:1
     Nodes=p03-amd01 CPU_IDs=0-1 Mem=0 GRES=fpga:xilinx_alveo_u250:1(IDX:0)
```

The IDX in the GRES attribute specifies the index(es) of the FPGA(s) (or GPUs) allocated to the job on the node. In the given example, the allocated resource is `fpga:xilinx_alveo_u250:1(IDX:0)`, so we should use the FPGA with index/number 0 on node p03-amd01.

### Request Specific Resources

It is possible to allocate specific resources. This is useful for the p03-amd partition, which is equipped with FPGAs of different types.

A GRES entry uses the format `name[[:type]:count]`; in the following example, the name is `fpga`, the type is `xilinx_alveo_u280`, and the count is 2.

```
$ salloc -A PROJECT-ID -p p03-amd --gres=fpga:xilinx_alveo_u280:2
salloc: Granted job allocation XXX
salloc: Waiting for resource configuration
salloc: Nodes p03-amd02 are ready for job

$ scontrol -d show job $SLURM_JOBID | grep -i gres
   JOB_GRES=fpga:xilinx_alveo_u280:2
     Nodes=p03-amd02 CPU_IDs=0 Mem=0 GRES=fpga:xilinx_alveo_u280(IDX:0-1)
   TresPerNode=gres:fpga:xilinx_alveo_u280:2
```

[1]: https://slurm.schedmd.com/
[2]: https://slurm.schedmd.com/srun.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES
# Accessing the DGX-2

## Before You Access

!!! warning
    GPUs are single-user devices. GPU memory is not purged between job runs and it can be read (but not written) by any user. Consider the confidentiality of your running jobs.

## How to Access

The DGX-2 machine is integrated into the [Barbora cluster][3].
It can be accessed from the Barbora login nodes `barbora.it4i.cz` through the Barbora scheduler queue `qdgx`, as the compute node `cn202`.

## Storage

There are three shared file systems on the DGX-2 system: HOME, SCRATCH (LSCRATCH), and PROJECT.

### HOME

The HOME filesystem is realized as an NFS filesystem. This is a shared home from the [Barbora cluster][1].

### SCRATCH

The SCRATCH is realized on NVMe storage. The SCRATCH filesystem is mounted in the `/scratch` directory.
Accessible capacity is 22TB, shared among all users.

!!! warning
    Files on the SCRATCH filesystem that are not accessed for more than 60 days will be automatically deleted.

### PROJECT

The PROJECT data storage is IT4Innovations' central data storage accessible from all clusters.
For more information on accessing PROJECT, its quotas, etc., see the [PROJECT Data Storage][2] section.

[1]: ../../barbora/storage/#home-file-system
[2]: ../../storage/project-storage
[3]: ../../barbora/introduction
# Introduction

The NVIDIA DGX-2 is a very powerful computational node, featuring high-end x86_64 processors and 16 NVIDIA V100-SXM3 GPUs.

| NVIDIA DGX-2  | |
| --- | --- |
| CPUs | 2 x Intel Xeon Platinum |
| GPUs | 16 x NVIDIA Tesla V100 32GB HBM2 |
| System Memory | Up to 1.5 TB DDR4 |
| GPU Memory | 512 GB HBM2 (16 x 32 GB)	|
| Storage | 30 TB NVMe, Up to 60 TB |
| Networking | 8 x Infiniband or 8 x 100 GbE |
| Power | 10 kW	|
| Size | 350 lbs |
| GPU Throughput | Tensor: 1920 TFLOPs, FP16: 520 TFLOPs, FP32: 260 TFLOPs, FP64: 130 TFLOPs |

The [DGX-2][a] introduces NVIDIA’s new NVSwitch, enabling 300 GB/s chip-to-chip communication at 12 times the speed of PCIe.

With NVLink2, it enables 16x NVIDIA V100-SXM3 GPUs in a single system, for a total bandwidth going beyond 14 TB/s.
Featuring a pair of Xeon 8168 CPUs, 1.5 TB of memory, and 30 TB of NVMe storage,
we get a system that consumes 10 kW, weighs 163.29 kg, but offers double-precision performance in excess of 130 TF.

The DGX-2 is designed to be a powerful server in its own right.
On the storage side, the DGX-2 comes with 30TB of NVMe-based solid state storage.
For clustering or further inter-system communications, it also offers InfiniBand and 100GigE connectivity, up to eight of them.

Further, the [DGX-2][b] offers a total of ~2 PFLOPs of half-precision performance in a single system when using the tensor cores.

![](../img/dgx1.png)

With the DGX-2, training AlexNet, the network that 'started' the latest machine learning revolution, now takes 18 minutes.

The DGX-2 is able to complete the training process
for FAIRSEQ – a neural network model for language translation – 10x faster than a DGX-1 system,
bringing it down to less than two days total rather than 15 days.

The new NVSwitches mean that the PCIe lanes of the CPUs can be redirected elsewhere, most notably towards storage and networking connectivity.
The topology of the DGX-2 means that all 16 GPUs are able to pool their memory into a unified memory space,
though with the usual tradeoffs involved if going off-chip.

![](../img/dgx2-nvlink.png)

[a]: https://www.nvidia.com/content/dam/en-zz/es_em/Solutions/Data-Center/dgx-2/nvidia-dgx-2-datasheet.pdf
[b]: https://www.youtube.com/embed/OTOGw0BRqK0

# Migration to e-INFRA CZ

## Introduction

IT4Innovations is a part of [e-INFRA CZ][1] - the strategic research infrastructure of the Czech Republic, which provides capacities and resources for the transmission, storage, and processing of scientific and research data. In January 2022, IT4I began the process of integrating its services.

As a part of the process, a joint e-INFRA CZ user base has been established. This included a migration of eligible IT4I accounts.

## Who Has Been Affected

The migration affected all accounts of users affiliated with an academic organization in the Czech Republic who also have an OPEN-XX-XX project. Affected users have received an email with information about changes in personal data processing.

## Who Has Not Been Affected

Commercial users, training accounts, suppliers, and service accounts were **not** affected by the migration.

## Process

During the process, additional steps have been required for successful migration.

This may have included:

1. e-INFRA CZ registration, if one does not already exist.
2. e-INFRA CZ password reset, if one does not already exist.

## Steps After Migration

After the migration, you must use your **e-INFRA CZ credentials** to access all IT4I services as well as [e-INFRA CZ services][5].

Successfully migrated accounts tied to e-INFRA CZ can be self-managed at [e-INFRA CZ User profile][4].

!!! tip "Recommendation"
    We recommend [verifying your SSH keys][6] for cluster access.

## Troubleshooting

If you have a problem with an account migrated to the e-INFRA CZ user base, contact [CESNET support][7].

If you have questions or a problem with an IT4I account (i.e. an account not eligible for migration), contact [IT4I support][2].

[1]: https://www.e-infra.cz/en
[2]: mailto:support@it4i.cz
[3]: https://www.cesnet.cz/?lang=en
[4]: https://profile.e-infra.cz/
[5]: https://www.e-infra.cz/en/services
[6]: https://profile.e-infra.cz/profile/settings/sshKeys
[7]: mailto:support@cesnet.cz

Additional files added in this revision (previews collapsed due to the size limit): docs.it4i/dice.md, docs.it4i/index.md, docs.it4i/prace.md, docs.it4i/robots.txt, docs.it4i/src/css.css, material/404.html, mkdocs.yml, package.json, pathcheck.sh, requirements.txt, scripts/add_version.sh, scripts/clean_json.sh, scripts/colors.sh, scripts/get_cvs.sh, scripts/get_modules.sh, scripts/maketitle.py, scripts/matrix.py, scripts/meta-json.sh, scripts/movefiles.sh, scripts/movepublic.sh, scripts/titlemd.py, scripts/url_test.py, snippets/mathjax.md, and several further files.