diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5928ace6dd55c58f8ead4e72e839d6216f45c8f5..18e00fcc53dc9091170b30339588df74972bf1c3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -39,7 +39,7 @@ ext_links: image: davidhrbac/docker-mdcheck:latest allow_failure: true after_script: - # remove JSON results + # remove JSON results - rm *.json script: #- find docs.it4i/ -name '*.md' -exec grep --color -l http {} + | xargs awesome_bot -t 10 @@ -63,7 +63,7 @@ mkdocs: #- apt-get -y install git # add version to footer - bash scripts/add_version.sh - # get modules list from clusters + # get modules list from clusters - bash scripts/get_modules.sh # regenerate modules matrix - python scripts/modules-matrix.py > docs.it4i/modules-matrix.md @@ -75,7 +75,7 @@ mkdocs: # replace broken links in 404.html - sed -i 's,href="" title=",href="/" title=",g' site/404.html # compress sitemap - - gzip < site/sitemap.xml > site/sitemap.xml.gz + - gzip < site/sitemap.xml > site/sitemap.xml.gz artifacts: paths: - site @@ -90,11 +90,11 @@ shellcheck: - find . -name *.sh -not -path "./docs.it4i/*" -not -path "./site/*" -exec shellcheck {} + deploy to stage: - environment: stage + environment: stage stage: deploy image: davidhrbac/docker-mkdocscheck:latest before_script: - # install ssh-agent + # install ssh-agent - 'which ssh-agent || ( apt-get update -y && apt-get install openssh-client -y )' - 'which rsync || ( apt-get update -y && apt-get install rsync -y )' # run ssh-agent @@ -117,7 +117,7 @@ deploy to production: stage: deploy image: davidhrbac/docker-mkdocscheck:latest before_script: - # install ssh-agent + # install ssh-agent - 'which ssh-agent || ( apt-get update -y && apt-get install openssh-client -y )' - 'which rsync || ( apt-get update -y && apt-get install rsync -y )' # run ssh-agent @@ -127,7 +127,7 @@ deploy to production: # disable host key checking (NOTE: makes you susceptible to man-in-the-middle attacks) # WARNING: use only in docker container, if you use it with shell you will overwrite your user's ssh config - mkdir -p ~/.ssh - - echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config + - echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config - useradd -lM nginx script: - chown nginx:nginx site -R diff --git a/README.md b/README.md index 78d4938aba704d2292ddb4e864a94f1435e90274..7ca5d57a12709555af7dac7b80f093118926f22d 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ Mellanox ### Formulas are made with: -* https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/ -* https://www.mathjax.org/ +* [https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/](https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/) +* [https://www.mathjax.org/](https://www.mathjax.org/) You can add formula to page like this: diff --git a/docs.it4i/anselm/resource-allocation-and-job-execution.md b/docs.it4i/anselm/resource-allocation-and-job-execution.md index 8df8072c9e5ddefbeba31c071309697ae1d6f92b..4585588dd18b1d308bd32c434dd2b09f50f9c154 100644 --- a/docs.it4i/anselm/resource-allocation-and-job-execution.md +++ b/docs.it4i/anselm/resource-allocation-and-job-execution.md @@ -1,6 +1,6 @@ # Resource Allocation and Job Execution -To run a [job](ob-submission-and-execution/), [computational resources](resources-allocation-policy/) for this particular job must be allocated. This is done via the PBS Pro job workload manager software, which efficiently distributes workloads across the supercomputer. 
Extensive information about PBS Pro can be found in the [official documentation here](../pbspro/), especially in the PBS Pro User's Guide. +To run a [job](job-submission-and-execution/), [computational resources](resources-allocation-policy/) for this particular job must be allocated. This is done via the PBS Pro job workload manager software, which efficiently distributes workloads across the supercomputer. Extensive information about PBS Pro can be found in the [official documentation here](../pbspro/), especially in the PBS Pro User's Guide. ## Resources Allocation Policy diff --git a/docs.it4i/anselm/resources-allocation-policy.md b/docs.it4i/anselm/resources-allocation-policy.md index 1ad88599387bc8c0d3dee992d7a8dcf2908c6df4..8519303bde96a0a8601e9a2d7df9c81aca430577 100644 --- a/docs.it4i/anselm/resources-allocation-policy.md +++ b/docs.it4i/anselm/resources-allocation-policy.md @@ -108,6 +108,4 @@ Options: ---8<--- "resource_accounting.md" - ---8<--- "mathjax.md" - diff --git a/docs.it4i/anselm/software/chemistry/molpro.md b/docs.it4i/anselm/software/chemistry/molpro.md deleted file mode 100644 index 9b08cb6ec8d2137e936f391eae4af97789d4f229..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/chemistry/molpro.md +++ /dev/null @@ -1,63 +0,0 @@ -# Molpro - -Molpro is a complete system of ab initio programs for molecular electronic structure calculations. - -## About Molpro - -Molpro is a software package used for accurate ab-initio quantum chemistry calculations. More information can be found at the [official webpage](http://www.molpro.net/). - -## License - -Molpro software package is available only to users that have a valid license. Please contact support to enable access to Molpro if you have a valid license appropriate for running on our cluster (eg. academic research group licence, parallel execution). - -To run Molpro, you need to have a valid license token present in " $HOME/.molpro/token". You can download the token from [Molpro website](https://www.molpro.net/licensee/?portal=licensee). - -## Installed Version - -Currently on Anselm is installed version 2010.1, patch level 45, parallel version compiled with Intel compilers and Intel MPI. - -Compilation parameters are default: - -| Parameter | Value | -| ---------------------------------- | ------------ | -| max number of atoms | 200 | -| max number of valence orbitals | 300 | -| max number of basis functions | 4095 | -| max number of states per symmmetry | 20 | -| max number of state symmetries | 16 | -| max number of records | 200 | -| max number of primitives | maxbfn x [2] | - -## Running - -Molpro is compiled for parallel execution using MPI and OpenMP. By default, Molpro reads the number of allocated nodes from PBS and launches a data server on one node. On the remaining allocated nodes, compute processes are launched, one process per node, each with 16 threads. You can modify this behavior by using -n, -t and helper-server options. Please refer to the [Molpro documentation](http://www.molpro.net/info/2010.1/doc/manual/node9.html) for more details. - -!!! note - The OpenMP parallelization in Molpro is limited and has been observed to produce limited scaling. We therefore recommend to use MPI parallelization only. This can be achieved by passing option mpiprocs=16:ompthreads=1 to PBS. - -You are advised to use the -d option to point to a directory in [SCRATCH file system](../../storage/storage/). 
Molpro can produce a large amount of temporary data during its run, and it is important that these are placed in the fast scratch file system. - -### Example jobscript - -```bash - #PBS -A IT4I-0-0 - #PBS -q qprod - #PBS -l select=1:ncpus=16:mpiprocs=16:ompthreads=1 - - cd $PBS_O_WORKDIR - - # load Molpro module - module add molpro - - # create a directory in the SCRATCH filesystem - mkdir -p /scratch/$USER/$PBS_JOBID - - # copy an example input - cp /apps/chem/molpro/2010.1/molprop_2010_1_Linux_x86_64_i8/examples/caffeine_opt_diis.com . - - # run Molpro with default options - molpro -d /scratch/$USER/$PBS_JOBID caffeine_opt_diis.com - - # delete scratch directory - rm -rf /scratch/$USER/$PBS_JOBID -``` diff --git a/docs.it4i/anselm/software/chemistry/nwchem.md b/docs.it4i/anselm/software/chemistry/nwchem.md deleted file mode 100644 index e4f84d49f9b8a38cba53f212d7db1bc6c8c8c7d2..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/chemistry/nwchem.md +++ /dev/null @@ -1,42 +0,0 @@ -# NWChem - -## Introduction - -NWChem aims to provide its users with computational chemistry tools that are scalable both in their ability to treat large scientific computational chemistry problems efficiently, and in their use of available parallel computing resources from high-performance parallel supercomputers to conventional workstation clusters. - -[Homepage](http://www.nwchem-sw.org/index.php/Main_Page) - -## Installed Versions - -The following versions are currently installed: - -* 6.1.1, not recommended, problems have been observed with this version -* 6.3-rev2-patch1, current release with QMD patch applied. Compiled with Intel compilers, MKL and Intel MPI -* 6.3-rev2-patch1-openmpi, same as above, but compiled with OpenMPI and NWChem provided BLAS instead of MKL. This version is expected to be slower -* 6.3-rev2-patch1-venus, this version contains only libraries for VENUS interface linking. Does not provide standalone NWChem executable - -For a current list of installed versions, execute: - -```console -$ ml av nwchem -``` - -## Running - -NWChem is compiled for parallel MPI execution. Normal procedure for MPI jobs applies. Sample jobscript: - -```bash - #PBS -A IT4I-0-0 - #PBS -q qprod - #PBS -l select=1:ncpus=16 - - module add nwchem/6.3-rev2-patch1 - mpirun -np 16 nwchem h2o.nw -``` - -## Options - -Please refer to [the documentation](http://www.nwchem-sw.org/index.php/Release62:Top-level) and in the input file set the following directives : - -* MEMORY : controls the amount of memory NWChem will use -* SCRATCH_DIR : set this to a directory in [SCRATCH file system](../../storage/storage/#scratch) (or run the calculation completely in a scratch directory). For certain calculations, it might be advisable to reduce I/O by forcing "direct" mode, e.g.. "scf direct" diff --git a/docs.it4i/anselm/software/compilers.md b/docs.it4i/anselm/software/compilers.md deleted file mode 100644 index 71e60499b1bb335ddb7a6919e22457aa70b68fa5..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/compilers.md +++ /dev/null @@ -1,156 +0,0 @@ -# Compilers - -## Available Compilers, Including GNU, INTEL, and UPC Compilers - -Currently there are several compilers for different programming languages available on the Anselm cluster: - -* C/C++ -* Fortran 77/90/95 -* Unified Parallel C -* Java -* NVIDIA CUDA - -The C/C++ and Fortran compilers are divided into two main groups GNU and Intel. 
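
As a quick orientation before the per-group sections below, a minimal compile-and-run sketch with each group (module and command names as used elsewhere on this page; `hello.c` is a hypothetical trivial source file):

```console
$ ml gcc
$ gcc -O3 -o hello_gnu hello.c
$ ./hello_gnu

$ ml intel
$ icc -O3 -o hello_intel hello.c
$ ./hello_intel
```
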
- -## Intel Compilers - -For information about the usage of Intel Compilers and other Intel products, please read the [Intel Parallel studio](intel-suite/) page. - -## GNU C/C++ and Fortran Compilers - -For compatibility reasons there are still available the original (old 4.4.6-4) versions of GNU compilers as part of the OS. These are accessible in the search path by default. - -It is strongly recommended to use the up to date version (4.8.1) which comes with the module gcc: - -```console -$ ml gcc -$ gcc -v -$ g++ -v -$ gfortran -v -``` - -With the module loaded two environment variables are predefined. One for maximum optimizations on the Anselm cluster architecture, and the other for debugging purposes: - -```console -$ echo $OPTFLAGS - -O3 -march=corei7-avx -$ echo $DEBUGFLAGS - -O0 -g -``` - -For more information about the possibilities of the compilers, please see the man pages. - -## Unified Parallel C - - UPC is supported by two compiler/runtime implementations: - -* GNU - SMP/multi-threading support only -* Berkley - multi-node support as well as SMP/multi-threading support - -### GNU UPC Compiler - -To use the GNU UPC compiler and run the compiled binaries use the module gupc - -```console -$ module add gupc -$ gupc -v -$ g++ -v -``` - -Simple program to test the compiler - -```console -$ cat count.upc - - /* hello.upc - a simple UPC example */ - #include <upc.h> - #include <stdio.h> - - int main() { - if (MYTHREAD == 0) { - printf("Welcome to GNU UPC!!!n"); - } - upc_barrier; - printf(" - Hello from thread %in", MYTHREAD); - return 0; - } -``` - -To compile the example use - -```console -$ gupc -o count.upc.x count.upc -``` - -To run the example with 5 threads issue - -```console -$ ./count.upc.x -fupc-threads-5 -``` - -For more information see the man pages. - -### Berkley UPC Compiler - -To use the Berkley UPC compiler and runtime environment to run the binaries use the module bupc - -```console -$ module add bupc -$ upcc -version -``` - -As default UPC network the "smp" is used. This is very quick and easy way for testing/debugging, but limited to one node only. - -For production runs, it is recommended to use the native Infiband implementation of UPC network "ibv". For testing/debugging using multiple nodes, the "mpi" UPC network is recommended. - -!!! warning - Selection of the network is done at the compile time and not at runtime (as expected)! - -Example UPC code: - -```console -$ cat hello.upc - - /* hello.upc - a simple UPC example */ - #include <upc.h> - #include <stdio.h> - - int main() { - if (MYTHREAD == 0) { - printf("Welcome to Berkeley UPC!!!n"); - } - upc_barrier; - printf(" - Hello from thread %in", MYTHREAD); - return 0; - } -``` - -To compile the example with the "ibv" UPC network use - -```console -$ upcc -network=ibv -o hello.upc.x hello.upc -``` - -To run the example with 5 threads issue - -```console -$ upcrun -n 5 ./hello.upc.x -``` - -To run the example on two compute nodes using all 32 cores, with 32 threads, issue - -```console -$ qsub -I -q qprod -A PROJECT_ID -l select=2:ncpus=16 -$ module add bupc -$ upcrun -n 32 ./hello.upc.x -``` - -For more information see the man pages. - -## Java - -For information how to use Java (runtime and/or compiler), please read the [Java page](java/). - -## NVIDIA CUDA - -For information on how to work with NVIDIA CUDA, please read the [NVIDIA CUDA page](nvidia-cuda/). 
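
As a minimal illustration of the predefined `$OPTFLAGS` and `$DEBUGFLAGS` variables described above (a sketch only; `myprog.c` is a hypothetical source file):

```console
$ ml gcc
$ gcc $OPTFLAGS -o myprog.x myprog.c        # optimized build: -O3 -march=corei7-avx
$ gcc $DEBUGFLAGS -o myprog_dbg.x myprog.c  # debug build: -O0 -g
```
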
diff --git a/docs.it4i/anselm/software/comsol-multiphysics.md b/docs.it4i/anselm/software/comsol-multiphysics.md deleted file mode 100644 index 74672428542f3643d754768b7d3c44ed22f22cb6..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/comsol-multiphysics.md +++ /dev/null @@ -1,120 +0,0 @@ -# COMSOL Multiphysics - -## Introduction - -[COMSOL](http://www.comsol.com) is a powerful environment for modelling and solving various engineering and scientific problems based on partial differential equations. COMSOL is designed to solve coupled or multiphysics phenomena. For many -standard engineering problems COMSOL provides add-on products such as electrical, mechanical, fluid flow, and chemical -applications. - -* [Structural Mechanics Module](http://www.comsol.com/structural-mechanics-module), -* [Heat Transfer Module](http://www.comsol.com/heat-transfer-module), -* [CFD Module](http://www.comsol.com/cfd-module), -* [Acoustics Module](http://www.comsol.com/acoustics-module), -* and [many others](http://www.comsol.com/products) - -COMSOL also allows an interface support for equation-based modelling of partial differential equations. - -## Execution - -On the Anselm cluster COMSOL is available in the latest stable version. There are two variants of the release: - -* **Non commercial** or so called **EDU variant**, which can be used for research and educational purposes. -* **Commercial** or so called **COM variant**, which can used also for commercial activities. **COM variant** has only subset of features compared to the **EDU variant** available. More about licensing will be posted here soon. - -To load the of COMSOL load the module - -```console -$ ml comsol -``` - -By default the **EDU variant** will be loaded. If user needs other version or variant, load the particular version. To obtain the list of available versions use - -```console -$ ml av comsol -``` - -If user needs to prepare COMSOL jobs in the interactive mode it is recommend to use COMSOL on the compute nodes via PBS Pro scheduler. In order run the COMSOL Desktop GUI on Windows is recommended to use the Virtual Network Computing (VNC). - -```console -$ xhost + -$ qsub -I -X -A PROJECT_ID -q qprod -l select=1:ncpus=16 -$ ml comsol -$ comsol -``` - -To run COMSOL in batch mode, without the COMSOL Desktop GUI environment, user can utilized the default (comsol.pbs) job script and execute it via the qsub command. - -```bash -#!/bin/bash -#PBS -l select=3:ncpus=16 -#PBS -q qprod -#PBS -N JOB_NAME -#PBS -A PROJECT_ID - -cd /scratch/$USER/ || exit - -echo Time is `date` -echo Directory is `pwd` -echo '**PBS_NODEFILE***START*******' -cat $PBS_NODEFILE -echo '**PBS_NODEFILE***END*********' - -text_nodes < cat $PBS_NODEFILE - -module load comsol -# module load comsol/43b-COM - -ntask=$(wc -l $PBS_NODEFILE) - -comsol -nn ${ntask} batch -configuration /tmp –mpiarg –rmk –mpiarg pbs -tmpdir /scratch/$USER/ -inputfile name_input_f.mph -outputfile name_output_f.mph -batchlog name_log_f.log -``` - -Working directory has to be created before sending the (comsol.pbs) job script into the queue. Input file (name_input_f.mph) has to be in working directory or full path to input file has to be specified. The appropriate path to the temp directory of the job has to be set by command option (-tmpdir). - -## LiveLink for MATLAB - -COMSOL is the software package for the numerical solution of the partial differential equations. 
LiveLink for MATLAB allows connection to the COMSOL API (Application Programming Interface) with the benefits of the programming language and computing environment of the MATLAB. - -LiveLink for MATLAB is available in both **EDU** and **COM** **variant** of the COMSOL release. On Anselm 1 commercial (**COM**) license and the 5 educational (**EDU**) licenses of LiveLink for MATLAB (please see the [ISV Licenses](isv_licenses/)) are available. -Following example shows how to start COMSOL model from MATLAB via LiveLink in the interactive mode. - -```console -$ xhost + -$ qsub -I -X -A PROJECT_ID -q qexp -l select=1:ncpus=16 -$ ml matlab -$ ml comsol -$ comsol server matlab -``` - -At the first time to launch the LiveLink for MATLAB (client-MATLAB/server-COMSOL connection) the login and password is requested and this information is not requested again. - -To run LiveLink for MATLAB in batch mode with (comsol_matlab.pbs) job script you can utilize/modify the following script and execute it via the qsub command. - -```bash -#!/bin/bash -#PBS -l select=3:ncpus=16 -#PBS -q qprod -#PBS -N JOB_NAME -#PBS -A PROJECT_ID - -cd /scratch/$USER || exit - -echo Time is `date` -echo Directory is `pwd` -echo '**PBS_NODEFILE***START*******' -cat $PBS_NODEFILE -echo '**PBS_NODEFILE***END*********' - -text_nodes < cat $PBS_NODEFILE - -module load matlab -module load comsol/43b-EDU - -ntask=$(wc -l $PBS_NODEFILE) - -comsol -nn ${ntask} server -configuration /tmp -mpiarg -rmk -mpiarg pbs -tmpdir /scratch/$USER & -cd /apps/engineering/comsol/comsol43b/mli -matlab -nodesktop -nosplash -r "mphstart; addpath /scratch/$USER; test_job" -``` - -This example shows, how to run LiveLink for MATLAB with following configuration: 3 nodes and 16 cores per node. Working directory has to be created before submitting (comsol_matlab.pbs) job script into the queue. Input file (test_job.m) has to be in working directory or full path to input file has to be specified. The MATLAB command option (-r ”mphstart”) created a connection with a COMSOL server using the default port number. diff --git a/docs.it4i/anselm/software/debuggers/allinea-ddt.md b/docs.it4i/anselm/software/debuggers/allinea-ddt.md deleted file mode 100644 index f85848417002cc5c9f15d54ea437410ca4585f11..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/debuggers/allinea-ddt.md +++ /dev/null @@ -1,94 +0,0 @@ -# Allinea Forge (DDT,MAP) - -Allinea Forge consist of two tools - debugger DDT and profiler MAP. - -Allinea DDT, is a commercial debugger primarily for debugging parallel MPI or OpenMP programs. It also has a support for GPU (CUDA) and Intel Xeon Phi accelerators. DDT provides all the standard debugging features (stack trace, breakpoints, watches, view variables, threads etc.) for every thread running as part of your program, or for every process - even if these processes are distributed across a cluster using an MPI implementation. - -Allinea MAP is a profiler for C/C++/Fortran HPC codes. It is designed for profiling parallel code, which uses pthreads, OpenMP or MPI. - -## License and Limitations for Anselm Users - -On Anselm users can debug OpenMP or MPI code that runs up to 64 parallel processes. In case of debugging GPU or Xeon Phi accelerated codes the limit is 8 accelerators. These limitation means that: - -* 1 user can debug up 64 processes, or -* 32 users can debug 2 processes, etc. - -In case of debugging on accelerators: - -* 1 user can debug on up to 8 accelerators, or -* 8 users can debug on single accelerator. 
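
For scale, the 64-process ceiling above corresponds, for example, to four fully populated 16-core nodes in a single interactive debugging session (a sketch only; substitute your own project ID and adjust the queue and walltime to your needs — see the job-start section below for the full workflow):

```console
$ qsub -I -X -A PROJECT_ID -q qexp -l select=4:ncpus=16:mpiprocs=16
```
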
- -## Compiling Code to Run With DDT - -### Modules - -Load all necessary modules to compile the code. For example: - -```console -$ ml intel -$ ml impi ... or ... module load openmpi/X.X.X-icc -``` - -Load the Allinea DDT module: - -```console -$ ml Forge -``` - -Compile the code: - -```console -$ mpicc -g -O0 -o test_debug test.c - -$ mpif90 -g -O0 -o test_debug test.f -``` - -### Compiler Flags - -Before debugging, you need to compile your code with theses flags: - -!!! note - \* **g** : Generates extra debugging information usable by GDB. -g3 includes even more debugging information. This option is available for GNU and INTEL C/C++ and Fortran compilers. - \* **O0** : Suppress all optimizations. - -## Starting a Job With DDT - -Be sure to log in with an X window forwarding enabled. This could mean using the -X in the ssh: - -```console -$ ssh -X username@anselm.it4i.cz -``` - -Other options is to access login node using VNC. Please see the detailed information on how to [use graphic user interface on Anselm](/general/accessing-the-clusters/graphical-user-interface/x-window-system/) - -From the login node an interactive session **with X windows forwarding** (-X option) can be started by following command: - -```console -$ qsub -I -X -A NONE-0-0 -q qexp -lselect=1:ncpus=16:mpiprocs=16,walltime=01:00:00 -``` - -Then launch the debugger with the ddt command followed by the name of the executable to debug: - -```console -$ ddt test_debug -``` - -A submission window that appears have a prefilled path to the executable to debug. You can select the number of MPI processors and/or OpenMP threads on which to run and press run. Command line arguments to a program can be entered to the "Arguments " box. - - - -To start the debugging directly without the submission window, user can specify the debugging and execution parameters from the command line. For example the number of MPI processes is set by option "-np 4". Skipping the dialog is done by "-start" option. To see the list of the "ddt" command line parameters, run "ddt --help". - -```console -ddt -start -np 4 ./hello_debug_impi -``` - -## Documentation - -Users can find original User Guide after loading the DDT module: - -```console -$DDTPATH/doc/userguide.pdf -``` - -[1] Discipline, Magic, Inspiration and Science: Best Practice Debugging with Allinea DDT, Workshop conducted at LLNL by Allinea on May 10, 2013, [link](https://computing.llnl.gov/tutorials/allineaDDT/index.html) diff --git a/docs.it4i/anselm/software/debuggers/allinea-performance-reports.md b/docs.it4i/anselm/software/debuggers/allinea-performance-reports.md deleted file mode 100644 index a5399a61e7ae133d4c037391a1123b0170a132ec..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/debuggers/allinea-performance-reports.md +++ /dev/null @@ -1,59 +0,0 @@ -# Allinea Performance Reports - -## Introduction - -Allinea Performance Reports characterize the performance of HPC application runs. After executing your application through the tool, a synthetic HTML report is generated automatically, containing information about several metrics along with clear behavior statements and hints to help you improve the efficiency of your runs. - -The Allinea Performance Reports is most useful in profiling MPI programs. - -Our license is limited to 64 MPI processes. - -## Modules - -Allinea Performance Reports version 6.0 is available - -```console -$ ml PerformanceReports/6.0 -``` - -The module sets up environment variables, required for using the Allinea Performance Reports. 
This particular command loads the default module, which is performance reports version 4.2. - -## Usage - -!!! note - Use the the perf-report wrapper on your (MPI) program. - -Instead of [running your MPI program the usual way](../mpi/), use the the perf report wrapper: - -```console -$ perf-report mpirun ./mympiprog.x -``` - -The mpi program will run as usual. The perf-report creates two additional files, in \*.txt and \*.html format, containing the performance report. Note that [demanding MPI codes should be run within the queue system](../../job-submission-and-execution/). - -## Example - -In this example, we will be profiling the mympiprog.x MPI program, using Allinea performance reports. Assume that the code is compiled with Intel compilers and linked against Intel MPI library: - -First, we allocate some nodes via the express queue: - -```console -$ qsub -q qexp -l select=2:ncpus=16:mpiprocs=16:ompthreads=1 -I - qsub: waiting for job 262197.dm2 to start - qsub: job 262197.dm2 ready -``` - -Then we load the modules and run the program the usual way: - -```console -$ ml intel impi allinea-perf-report/4.2 -$ mpirun ./mympiprog.x -``` - -Now lets profile the code: - -```console -$ perf-report mpirun ./mympiprog.x -``` - -Performance report files [mympiprog_32p\*.txt](../../../src/mympiprog_32p_2014-10-15_16-56.txt) and [mympiprog_32p\*.html](../../../src/mympiprog_32p_2014-10-15_16-56.html) were created. We can see that the code is very efficient on MPI and is CPU bounded. diff --git a/docs.it4i/anselm/software/debuggers/debuggers.md b/docs.it4i/anselm/software/debuggers/debuggers.md deleted file mode 100644 index 3d38fd6a59565a1814df261d6cc2383f9bef7c59..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/debuggers/debuggers.md +++ /dev/null @@ -1,60 +0,0 @@ -# Debuggers and profilers summary - -## Introduction - -We provide state of the art programms and tools to develop, profile and debug HPC codes at IT4Innovations. On these pages, we provide an overview of the profiling and debugging tools available on Anslem at IT4I. - -## Intel Debugger - -The intel debugger version 13.0 is available, via module intel. The debugger works for applications compiled with C and C++ compiler and the ifort fortran 77/90/95 compiler. The debugger provides java GUI environment. Use X display for running the GUI. - -```console -$ ml intel -$ idb -``` - -Read more at the [Intel Debugger](intel-suite/intel-debugger/) page. - -## Allinea Forge (DDT/MAP) - -Allinea DDT, is a commercial debugger primarily for debugging parallel MPI or OpenMP programs. It also has a support for GPU (CUDA) and Intel Xeon Phi accelerators. DDT provides all the standard debugging features (stack trace, breakpoints, watches, view variables, threads etc.) for every thread running as part of your program, or for every process even if these processes are distributed across a cluster using an MPI implementation. - -```console -$ ml Forge -$ forge -``` - -Read more at the [Allinea DDT](debuggers/allinea-ddt/) page. - -## Allinea Performance Reports - -Allinea Performance Reports characterize the performance of HPC application runs. After executing your application through the tool, a synthetic HTML report is generated automatically, containing information about several metrics along with clear behavior statements and hints to help you improve the efficiency of your runs. Our license is limited to 64 MPI processes. 
- -```console -$ ml PerformanceReports/6.0 -$ perf-report mpirun -n 64 ./my_application argument01 argument02 -``` - -Read more at the [Allinea Performance Reports](debuggers/allinea-performance-reports/) page. - -## RougeWave Totalview - -TotalView is a source- and machine-level debugger for multi-process, multi-threaded programs. Its wide range of tools provides ways to analyze, organize, and test programs, making it easy to isolate and identify problems in individual threads and processes in programs of great complexity. - -```console -$ ml totalview -$ totalview -``` - -Read more at the [Totalview](debuggers/total-view/) page. - -## Vampir Trace Analyzer - -Vampir is a GUI trace analyzer for traces in OTF format. - -```console -$ ml Vampir/8.5.0 -$ vampir -``` - -Read more at the [Vampir](vampir/) page. diff --git a/docs.it4i/anselm/software/debuggers/intel-vtune-amplifier.md b/docs.it4i/anselm/software/debuggers/intel-vtune-amplifier.md deleted file mode 100644 index 1d90aacfee0141246d4fbe41912ca8e3040b30db..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/debuggers/intel-vtune-amplifier.md +++ /dev/null @@ -1,73 +0,0 @@ -# Intel VTune Amplifier - -## Introduction - -Intel VTune Amplifier, part of Intel Parallel studio, is a GUI profiling tool designed for Intel processors. It offers a graphical performance analysis of single core and multithreaded applications. A highlight of the features: - -* Hotspot analysis -* Locks and waits analysis -* Low level specific counters, such as branch analysis and memory - bandwidth -* Power usage analysis - frequency and sleep states. - - - -## Usage - -To launch the GUI, first load the module: - -```console -$ module add VTune/2016_update1 -``` - -and launch the GUI : - -```console -$ amplxe-gui -``` - -!!! note - To profile an application with VTune Amplifier, special kernel modules need to be loaded. The modules are not loaded on Anselm login nodes, thus direct profiling on login nodes is not possible. Use VTune on compute nodes and refer to the documentation on using GUI applications. - -The GUI will open in new window. Click on "_New Project..._" to create a new project. After clicking _OK_, a new window with project properties will appear. At "_Application:_", select the bath to your binary you want to profile (the binary should be compiled with -g flag). Some additional options such as command line arguments can be selected. At "_Managed code profiling mode:_" select "_Native_" (unless you want to profile managed mode .NET/Mono applications). After clicking _OK_, your project is created. - -To run a new analysis, click "_New analysis..._". You will see a list of possible analysis. Some of them will not be possible on the current CPU (e.g. Intel Atom analysis is not possible on Sandy Bridge CPU), the GUI will show an error box if you select the wrong analysis. For example, select "_Advanced Hotspots_". Clicking on _Start _will start profiling of the application. - -## Remote Analysis - -VTune Amplifier also allows a form of remote analysis. In this mode, data for analysis is collected from the command line without GUI, and the results are then loaded to GUI on another machine. This allows profiling without interactive graphical jobs. To perform a remote analysis, launch a GUI somewhere, open the new analysis window and then click the button "_Command line_" in bottom right corner. It will show the command line needed to perform the selected analysis. 
- -The command line will look like this: - -```console -$ /apps/all/VTune/2016_update1/vtune_amplifier_xe_2016.1.1.434111/bin64/amplxe-cl -collect advanced-hotspots -knob collection-detail=stack-and-callcount -mrte-mode=native -target-duration-type=veryshort -app-working-dir /home/sta545/test -- /home/sta545/test_pgsesv -``` - -Copy the line to clipboard and then you can paste it in your jobscript or in command line. After the collection is run, open the GUI once again, click the menu button in the upper right corner, and select "_Open > Result..._". The GUI will load the results from the run. - -## Xeon Phi - -!!! note - This section is outdated. It will be updated with new information soon. - -It is possible to analyze both native and offload Xeon Phi applications. For offload mode, just specify the path to the binary. For native mode, you need to specify in project properties: - -Application: ssh - -Application parameters: mic0 source ~/.profile && /path/to/your/bin - -Note that we include source ~/.profile in the command to setup environment paths [as described here](../intel-xeon-phi/). - -!!! note - If the analysis is interrupted or aborted, further analysis on the card might be impossible and you will get errors like "ERROR connecting to MIC card". In this case please contact our support to reboot the MIC card. - -You may also use remote analysis to collect data from the MIC and then analyze it in the GUI later : - -```console -$ amplxe-cl -collect knc-hotspots -no-auto-finalize -- ssh mic0 - "export LD_LIBRARY_PATH=/apps/intel/composer_xe_2015.2.164/compiler/lib/mic/:/apps/intel/composer_xe_2015.2.164/mkl/lib/mic/; export KMP_AFFINITY=compact; /tmp/app.mic" -``` - -## References - -1. <https://www.rcac.purdue.edu/tutorials/phi/PerformanceTuningXeonPhi-Tullos.pdf> Performance Tuning for Intel® Xeon Phi™ Coprocessors diff --git a/docs.it4i/anselm/software/debuggers/total-view.md b/docs.it4i/anselm/software/debuggers/total-view.md deleted file mode 100644 index de618ace58562f36720e41a5dbb603c9b2478c06..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/debuggers/total-view.md +++ /dev/null @@ -1,158 +0,0 @@ -# Total View - -TotalView is a GUI-based source code multi-process, multi-thread debugger. - -## License and Limitations for Anselm Users - -On Anselm users can debug OpenMP or MPI code that runs up to 64 parallel processes. These limitation means that: - -```console - 1 user can debug up 64 processes, or - 32 users can debug 2 processes, etc. -``` - -Debugging of GPU accelerated codes is also supported. - -You can check the status of the licenses here: - -```console -$ cat /apps/user/licenses/totalview_features_state.txt - - # totalview - # ------------------------------------------------- - # FEATURE TOTAL USED AVAIL - # ------------------------------------------------- - TotalView_Team 64 0 64 - Replay 64 0 64 - CUDA 64 0 64 -``` - -## Compiling Code to Run With TotalView - -### Modules - -Load all necessary modules to compile the code. For example: - -```console -$ ml intel **or** ml foss -``` - -Load the TotalView module: - -```console -$ ml totalview/8.12 -``` - -Compile the code: - -```console -$ mpicc -g -O0 -o test_debug test.c -$ mpif90 -g -O0 -o test_debug test.f -``` - -### Compiler Flags - -Before debugging, you need to compile your code with theses flags: - -!!! note - \* **-g** : Generates extra debugging information usable by GDB. **-g3** includes even more debugging information. 
This option is available for GNU and INTEL C/C++ and Fortran compilers. - \* **-O0** : Suppress all optimizations. - -## Starting a Job With TotalView - -Be sure to log in with an X window forwarding enabled. This could mean using the -X in the ssh: - -```console -local $ ssh -X username@anselm.it4i.cz -``` - -Other options is to access login node using VNC. Please see the detailed information on how to use graphic user interface on Anselm. - -From the login node an interactive session with X windows forwarding (-X option) can be started by following command: - -```console -$ qsub -I -X -A NONE-0-0 -q qexp -lselect=1:ncpus=16:mpiprocs=16,walltime=01:00:00 -``` - -Then launch the debugger with the totalview command followed by the name of the executable to debug. - -### Debugging a Serial Code - -To debug a serial code use: - -```console -$ totalview test_debug -``` - -### Debugging a Parallel Code - Option 1 - -To debug a parallel code compiled with **OpenMPI** you need to setup your TotalView environment: - -!!! hint - To be able to run parallel debugging procedure from the command line without stopping the debugger in the mpiexec source code you have to add the following function to your `~/.tvdrc` file: - -```console - proc mpi_auto_run_starter {loaded_id} { - set starter_programs {mpirun mpiexec orterun} - set executable_name [TV::symbol get $loaded_id full_pathname] - set file_component [file tail $executable_name] - - if {[lsearch -exact $starter_programs $file_component] != -1} { - puts "*************************************" - puts "Automatically starting $file_component" - puts "*************************************" - dgo - } - } - - # Append this function to TotalView's image load callbacks so that - # TotalView run this program automatically. - - dlappend TV::image_load_callbacks mpi_auto_run_starter -``` - -The source code of this function can be also found in - -```console -$ /apps/mpi/openmpi/intel/1.6.5/etc/openmpi-totalview.tcl -``` - -!!! note - You can also add only following line to you ~/.tvdrc file instead of the entire function: - **source /apps/mpi/openmpi/intel/1.6.5/etc/openmpi-totalview.tcl** - -You need to do this step only once. - -Now you can run the parallel debugger using: - -```console -$ mpirun -tv -n 5 ./test_debug -``` - -When following dialog appears click on "Yes" - - - -At this point the main TotalView GUI window will appear and you can insert the breakpoints and start debugging: - - - -### Debugging a Parallel Code - Option 2 - -Other option to start new parallel debugging session from a command line is to let TotalView to execute mpirun by itself. In this case user has to specify a MPI implementation used to compile the source code. - -The following example shows how to start debugging session with Intel MPI: - -```console -$ ml intel -$ ml totalview -$ totalview -mpi "Intel MPI-Hydra" -np 8 ./hello_debug_impi -``` - -After running previous command you will see the same window as shown in the screenshot above. - -More information regarding the command line parameters of the TotalView can be found TotalView Reference Guide, Chapter 7: TotalView Command Syntax. - -## Documentation - -[1] The [TotalView documentation](http://www.roguewave.com/support/product-documentation/totalview-family.aspx#totalview) web page is a good resource for learning more about some of the advanced TotalView features. 
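
The DDT and TotalView walkthroughs above compile an unspecified `test.c` into `test_debug`; any small MPI program will do. A minimal, hypothetical candidate:

```cpp
/* test.c - a tiny MPI program to practice debugging on */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int rank, size, i;
    long sum = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* a small loop - a convenient place for breakpoints and watchpoints */
    for (i = 0; i <= rank; i++)
        sum += i;

    printf("Rank %d of %d computed sum %ld\n", rank, size, sum);

    MPI_Finalize();
    return 0;
}
```

Compile it with `mpicc -g -O0 -o test_debug test.c` exactly as shown above and pass `test_debug` to `ddt` or `totalview`.
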
diff --git a/docs.it4i/anselm/software/debuggers/vampir.md b/docs.it4i/anselm/software/debuggers/vampir.md deleted file mode 100644 index 1dfa23e7b8eed6c9deaf04439df6b01ed6358480..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/debuggers/vampir.md +++ /dev/null @@ -1,22 +0,0 @@ -# Vampir - -Vampir is a commercial trace analysis and visualization tool. It can work with traces in OTF and OTF2 formats. It does not have the functionality to collect traces, you need to use a trace collection tool (such as [Score-P](score-p/)) first to collect the traces. - - - -## Installed Versions - -Version 8.5.0 is currently installed as module Vampir/8.5.0 : - -```console -$ ml Vampir/8.5.0 -$ vampir & -``` - -## User Manual - -You can find the detailed user manual in PDF format in $EBROOTVAMPIR/doc/vampir-manual.pdf - -## References - -[1]. <https://www.vampir.eu> diff --git a/docs.it4i/anselm/software/intel-suite/intel-compilers.md b/docs.it4i/anselm/software/intel-suite/intel-compilers.md deleted file mode 100644 index d446655d915833a139353d5c76015f70db9a9645..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/intel-suite/intel-compilers.md +++ /dev/null @@ -1,36 +0,0 @@ -# Intel Compilers - -The Intel compilers version 13.1.1 are available, via module intel. The compilers include the icc C and C++ compiler and the ifort fortran 77/90/95 compiler. - -```console -$ ml intel -$ icc -v -$ ifort -v -``` - -The intel compilers provide for vectorization of the code, via the AVX instructions and support threading parallelization via OpenMP - -For maximum performance on the Anselm cluster, compile your programs using the AVX instructions, with reporting where the vectorization was used. We recommend following compilation options for high performance - -```console -$ icc -ipo -O3 -vec -xAVX -vec-report1 myprog.c mysubroutines.c -o myprog.x -$ ifort -ipo -O3 -vec -xAVX -vec-report1 myprog.f mysubroutines.f -o myprog.x -``` - -In this example, we compile the program enabling interprocedural optimizations between source files (-ipo), aggressive loop optimizations (-O3) and vectorization (-vec -xAVX) - -The compiler recognizes the omp, simd, vector and ivdep pragmas for OpenMP parallelization and AVX vectorization. Enable the OpenMP parallelization by the **-openmp** compiler switch. - -```console -$ icc -ipo -O3 -vec -xAVX -vec-report1 -openmp myprog.c mysubroutines.c -o myprog.x -$ ifort -ipo -O3 -vec -xAVX -vec-report1 -openmp myprog.f mysubroutines.f -o myprog.x -``` - -Read more at <http://software.intel.com/sites/products/documentation/doclib/stdxe/2013/composerxe/compiler/cpp-lin/index.htm> - -## Sandy Bridge/Haswell Binary Compatibility - -Anselm nodes are currently equipped with Sandy Bridge CPUs, while Salomon will use Haswell architecture. >The new processors are backward compatible with the Sandy Bridge nodes, so all programs that ran on the Sandy Bridge processors, should also run on the new Haswell nodes. >To get optimal performance out of the Haswell processors a program should make use of the special AVX2 instructions for this processor. One can do this by recompiling codes with the compiler flags >designated to invoke these instructions. For the Intel compiler suite, there are two ways of doing this: - -* Using compiler flag (both for Fortran and C): -xCORE-AVX2. This will create a binary with AVX2 instructions, specifically for the Haswell processors. Note that the executable will not run on Sandy Bridge nodes. 
-* Using compiler flags (both for Fortran and C): -xAVX -axCORE-AVX2. This will generate multiple, feature specific auto-dispatch code paths for Intel® processors, if there is a performance benefit. So this binary will run both on Sandy Bridge and Haswell processors. During runtime it will be decided which path to follow, dependent on which processor you are running on. In general this will result in larger binaries. diff --git a/docs.it4i/anselm/software/intel-suite/intel-debugger.md b/docs.it4i/anselm/software/intel-suite/intel-debugger.md deleted file mode 100644 index d3a5807fca1a0051c4424a5613f3faa57c26895a..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/intel-suite/intel-debugger.md +++ /dev/null @@ -1,73 +0,0 @@ -# Intel Debugger - -## Debugging Serial Applications - -The intel debugger version 13.0 is available, via module intel. The debugger works for applications compiled with C and C++ compiler and the ifort fortran 77/90/95 compiler. The debugger provides java GUI environment. Use X display for running the GUI. - -```baconsolesh -$ ml intel -$ idb -``` - -The debugger may run in text mode. To debug in text mode, use - -```console -$ idbc -``` - -To debug on the compute nodes, module intel must be loaded. The GUI on compute nodes may be accessed using the same way as in the GUI section - -Example: - -```console -$ qsub -q qexp -l select=1:ncpus=16 -X -I - qsub: waiting for job 19654.srv11 to start - qsub: job 19654.srv11 ready - -$ ml intel -$ ml java -$ icc -O0 -g myprog.c -o myprog.x -$ idb ./myprog.x -``` - -In this example, we allocate 1 full compute node, compile program myprog.c with debugging options -O0 -g and run the idb debugger interactively on the myprog.x executable. The GUI access is via X11 port forwarding provided by the PBS workload manager. - -## Debugging Parallel Applications - -Intel debugger is capable of debugging multithreaded and MPI parallel programs as well. - -### Small Number of MPI Ranks - -For debugging small number of MPI ranks, you may execute and debug each rank in separate xterm terminal (do not forget the X display. Using Intel MPI, this may be done in following way: - -```console -$ qsub -q qexp -l select=2:ncpus=16 -X -I - qsub: waiting for job 19654.srv11 to start - qsub: job 19655.srv11 ready - -$ ml intel -$ mpirun -ppn 1 -hostfile $PBS_NODEFILE --enable-x xterm -e idbc ./mympiprog.x -``` - -In this example, we allocate 2 full compute node, run xterm on each node and start idb debugger in command line mode, debugging two ranks of mympiprog.x application. The xterm will pop up for each rank, with idb prompt ready. The example is not limited to use of Intel MPI - -### Large Number of MPI Ranks - -Run the idb debugger from within the MPI debug option. This will cause the debugger to bind to all ranks and provide aggregated outputs across the ranks, pausing execution automatically just after startup. You may then set break points and step the execution manually. Using Intel MPI: - -```console - $ qsub -q qexp -l select=2:ncpus=16 -X -I - qsub: waiting for job 19654.srv11 to start - qsub: job 19655.srv11 ready - -$ ml intel -$ mpirun -n 32 -idb ./mympiprog.x -``` - -### Debugging Multithreaded Application - -Run the idb debugger in GUI mode. The menu Parallel contains number of tools for debugging multiple threads. One of the most useful tools is the **Serialize Execution** tool, which serializes execution of concurrent threads for easy orientation and identification of concurrency related bugs. 
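
A possible end-to-end sketch for the multithreaded case (the `-openmp` switch is the one documented on the Intel Compilers page; `myomp.c` is a hypothetical OpenMP source file):

```console
$ qsub -q qexp -l select=1:ncpus=16 -X -I
$ ml intel
$ ml java
$ icc -openmp -g -O0 -o myomp.x myomp.c
$ idb ./myomp.x
```
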
- -## Further Information - -Exhaustive manual on idb features and usage is published at [Intel website](http://software.intel.com/sites/products/documentation/doclib/stdxe/2013/composerxe/debugger/user_guide/index.htm) diff --git a/docs.it4i/anselm/software/intel-suite/intel-integrated-performance-primitives.md b/docs.it4i/anselm/software/intel-suite/intel-integrated-performance-primitives.md deleted file mode 100644 index 8e0451c69a082275e114c92acd223e3514317389..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/intel-suite/intel-integrated-performance-primitives.md +++ /dev/null @@ -1,81 +0,0 @@ -# Intel IPP - -## Intel Integrated Performance Primitives - -Intel Integrated Performance Primitives, version 7.1.1, compiled for AVX vector instructions is available, via module ipp. The IPP is a very rich library of highly optimized algorithmic building blocks for media and data applications. This includes signal, image and frame processing algorithms, such as FFT, FIR, Convolution, Optical Flow, Hough transform, Sum, MinMax, as well as cryptographic functions, linear algebra functions and many more. - -!!! note - Check out IPP before implementing own math functions for data processing, it is likely already there. - -```console -$ ml ipp -``` - -The module sets up environment variables, required for linking and running ipp enabled applications. - -## IPP Example - -```cpp - #include "ipp.h" - #include <stdio.h> - int main(int argc, char* argv[]) - { - const IppLibraryVersion *lib; - Ipp64u fm; - IppStatus status; - - status= ippInit(); //IPP initialization with the best optimization layer - if( status != ippStsNoErr ) { - printf("IppInit() Error:n"); - printf("%sn", ippGetStatusString(status) ); - return -1; - } - - //Get version info - lib = ippiGetLibVersion(); - printf("%s %sn", lib->Name, lib->Version); - - //Get CPU features enabled with selected library level - fm=ippGetEnabledCpuFeatures(); - printf("SSE :%cn",(fm>1)&1?'Y':'N'); - printf("SSE2 :%cn",(fm>2)&1?'Y':'N'); - printf("SSE3 :%cn",(fm>3)&1?'Y':'N'); - printf("SSSE3 :%cn",(fm>4)&1?'Y':'N'); - printf("SSE41 :%cn",(fm>6)&1?'Y':'N'); - printf("SSE42 :%cn",(fm>7)&1?'Y':'N'); - printf("AVX :%cn",(fm>8)&1 ?'Y':'N'); - printf("AVX2 :%cn", (fm>15)&1 ?'Y':'N' ); - printf("----------n"); - printf("OS Enabled AVX :%cn", (fm>9)&1 ?'Y':'N'); - printf("AES :%cn", (fm>10)&1?'Y':'N'); - printf("CLMUL :%cn", (fm>11)&1?'Y':'N'); - printf("RDRAND :%cn", (fm>13)&1?'Y':'N'); - printf("F16C :%cn", (fm>14)&1?'Y':'N'); - - return 0; - } -``` - -Compile above example, using any compiler and the ipp module. - -```console -$ ml intel -$ ml ipp - -$ icc testipp.c -o testipp.x -lippi -lipps -lippcore -``` - -You will need the ipp module loaded to run the ipp enabled executable. This may be avoided, by compiling library search paths into the executable - -```console -$ ml intel -$ ml ipp - -$ icc testipp.c -o testipp.x -Wl,-rpath=$LIBRARY_PATH -lippi -lipps -lippcore -``` - -## Code Samples and Documentation - -Intel provides number of [Code Samples for IPP](https://software.intel.com/en-us/articles/code-samples-for-intel-integrated-performance-primitives-library), illustrating use of IPP. 
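
For a flavour of the API beyond the version/feature query shown above, a short hedged sketch that adds two vectors with the signal-processing domain (link with `-lipps -lippcore` as in the earlier compile line):

```cpp
#include "ipp.h"
#include <stdio.h>

int main(void) {
    Ipp32f a[8], b[8], c[8];
    int i;

    ippInit();  /* select the best optimization layer, as in the example above */

    for (i = 0; i < 8; i++) { a[i] = (Ipp32f)i; b[i] = 1.0f; }

    /* c[i] = a[i] + b[i] for all 8 elements */
    ippsAdd_32f(a, b, c, 8);

    for (i = 0; i < 8; i++) printf("%.1f ", c[i]);
    printf("\n");
    return 0;
}
```
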
- -Read full documentation on IPP [on Intel website,](http://software.intel.com/sites/products/search/search.php?q=&x=15&y=6&product=ipp&version=7.1&docos=lin) in particular the [IPP Reference manual.](http://software.intel.com/sites/products/documentation/doclib/ipp_sa/71/ipp_manual/index.htm) diff --git a/docs.it4i/anselm/software/intel-suite/intel-mkl.md b/docs.it4i/anselm/software/intel-suite/intel-mkl.md deleted file mode 100644 index 6594f8193b800fa1fb269b8611456c6311adafcf..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/intel-suite/intel-mkl.md +++ /dev/null @@ -1,114 +0,0 @@ -# Intel MKL - -## Intel Math Kernel Library - -Intel Math Kernel Library (Intel MKL) is a library of math kernel subroutines, extensively threaded and optimized for maximum performance. Intel MKL provides these basic math kernels: - -* BLAS (level 1, 2, and 3) and LAPACK linear algebra routines, offering vector, vector-matrix, and matrix-matrix operations. -* The PARDISO direct sparse solver, an iterative sparse solver, and supporting sparse BLAS (level 1, 2, and 3) routines for solving sparse systems of equations. -* ScaLAPACK distributed processing linear algebra routines for Linux and Windows operating systems, as well as the Basic Linear Algebra Communications Subprograms (BLACS) and the Parallel Basic Linear Algebra Subprograms (PBLAS). -* Fast Fourier transform (FFT) functions in one, two, or three dimensions with support for mixed radices (not limited to sizes that are powers of 2), as well as distributed versions of these functions. -* Vector Math Library (VML) routines for optimized mathematical operations on vectors. -* Vector Statistical Library (VSL) routines, which offer high-performance vectorized random number generators (RNG) for several probability distributions, convolution and correlation routines, and summary statistics functions. -* Data Fitting Library, which provides capabilities for spline-based approximation of functions, derivatives and integrals of functions, and search. -* Extended Eigensolver, a shared memory version of an eigensolver based on the Feast Eigenvalue Solver. - -For details see the [Intel MKL Reference Manual](http://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mklman/index.htm). - -Intel MKL is available on Anselm - -```console -$ ml imkl -``` - -The module sets up environment variables, required for linking and running mkl enabled applications. The most important variables are the $MKLROOT, $MKL_INC_DIR, $MKL_LIB_DIR and $MKL_EXAMPLES - -!!! note - The MKL library may be linked using any compiler. With intel compiler use -mkl option to link default threaded MKL. - -### Interfaces - -The MKL library provides number of interfaces. The fundamental once are the LP64 and ILP64. The Intel MKL ILP64 libraries use the 64-bit integer type (necessary for indexing large arrays, with more than 231^-1 elements), whereas the LP64 libraries index arrays with the 32-bit integer type. - -| Interface | Integer type | -| --------- | -------------------------------------------- | -| LP64 | 32-bit, int, integer(kind=4), MPI_INT | -| ILP64 | 64-bit, long int, integer(kind=8), MPI_INT64 | - -### Linking - -Linking MKL libraries may be complex. Intel [mkl link line advisor](http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor) helps. See also [examples](intel-mkl/#examples) below. - -You will need the mkl module loaded to run the mkl enabled executable. This may be avoided, by compiling library search paths into the executable. 
Include rpath on the compile line: - -```console -$ icc .... -Wl,-rpath=$LIBRARY_PATH ... -``` - -### Threading - -!!! note - Advantage in using the MKL library is that it brings threaded parallelization to applications that are otherwise not parallel. - -For this to work, the application must link the threaded MKL library (default). Number and behaviour of MKL threads may be controlled via the OpenMP environment variables, such as OMP_NUM_THREADS and KMP_AFFINITY. MKL_NUM_THREADS takes precedence over OMP_NUM_THREADS - -```console -$ export OMP_NUM_THREADS=16 -$ export KMP_AFFINITY=granularity=fine,compact,1,0 -``` - -The application will run with 16 threads with affinity optimized for fine grain parallelization. - -## Examples - -Number of examples, demonstrating use of the MKL library and its linking is available on Anselm, in the $MKL_EXAMPLES directory. In the examples below, we demonstrate linking MKL to Intel and GNU compiled program for multi-threaded matrix multiplication. - -### Working With Examples - -```console -$ ml intel -$ cp -a $MKL_EXAMPLES/cblas /tmp/ -$ cd /tmp/cblas -$ make sointel64 function=cblas_dgemm -``` - -In this example, we compile, link and run the cblas_dgemm example, demonstrating use of MKL example suite installed on Anselm. - -### Example: MKL and Intel Compiler - -```console -$ ml intel -$ cp -a $MKL_EXAMPLES/cblas /tmp/ -$ cd /tmp/cblas -$ icc -w source/cblas_dgemmx.c source/common_func.c -mkl -o cblas_dgemmx.x -$ ./cblas_dgemmx.x data/cblas_dgemmx.d -``` - -In this example, we compile, link and run the cblas_dgemm example, demonstrating use of MKL with icc -mkl option. Using the -mkl option is equivalent to: - -```console -$ icc -w source/cblas_dgemmx.c source/common_func.c -o cblas_dgemmx.x -I$MKL_INC_DIR -L$MKL_LIB_DIR -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -``` - -In this example, we compile and link the cblas_dgemm example, using LP64 interface to threaded MKL and Intel OMP threads implementation. - -### Example: MKL and GNU Compiler - -```console -$ ml gcc -$ ml imkl -$ cp -a $MKL_EXAMPLES/cblas /tmp/ -$ cd /tmp/cblas -$ gcc -w source/cblas_dgemmx.c source/common_func.c -o cblas_dgemmx.x -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lm -$ ./cblas_dgemmx.x data/cblas_dgemmx.d -``` - -In this example, we compile, link and run the cblas_dgemm example, using LP64 interface to threaded MKL and gnu OMP threads implementation. - -## MKL and MIC Accelerators - -The MKL is capable to automatically offload the computations o the MIC accelerator. See section [Intel XeonPhi](../intel-xeon-phi/) for details. - -## Further Reading - -Read more on [Intel website](http://software.intel.com/en-us/intel-mkl), in particular the [MKL users guide](https://software.intel.com/en-us/intel-mkl/documentation/linux). diff --git a/docs.it4i/anselm/software/intel-suite/intel-tbb.md b/docs.it4i/anselm/software/intel-suite/intel-tbb.md deleted file mode 100644 index 497b26f5e46a62604b7eb542bd0579b2c7fbd358..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/intel-suite/intel-tbb.md +++ /dev/null @@ -1,42 +0,0 @@ -# Intel TBB - -## Intel Threading Building Blocks - -Intel Threading Building Blocks (Intel TBB) is a library that supports scalable parallel programming using standard ISO C++ code. It does not require special languages or compilers. To use the library, you specify tasks, not threads, and let the library map tasks onto threads in an efficient manner. 
The tasks are executed by a runtime scheduler and may -be offloaded to [MIC accelerator](../intel-xeon-phi/). - -Intel TBB version 4.1 is available on Anselm - -```console -$ ml tbb -``` - -The module sets up environment variables, required for linking and running tbb enabled applications. - -!!! note - Link the tbb library, using -ltbb - -## Examples - -Number of examples, demonstrating use of TBB and its built-in scheduler is available on Anselm, in the $TBB_EXAMPLES directory. - -```console -$ ml intel -$ ml tbb -$ cp -a $TBB_EXAMPLES/common $TBB_EXAMPLES/parallel_reduce /tmp/ -$ cd /tmp/parallel_reduce/primes -$ icc -O2 -DNDEBUG -o primes.x main.cpp primes.cpp -ltbb -$ ./primes.x -``` - -In this example, we compile, link and run the primes example, demonstrating use of parallel task-based reduce in computation of prime numbers. - -You will need the tbb module loaded to run the tbb enabled executable. This may be avoided, by compiling library search paths into the executable. - -```console -$ icc -O2 -o primes.x main.cpp primes.cpp -Wl,-rpath=$LIBRARY_PATH -ltbb -``` - -## Further Reading - -Read more on Intel website, <http://software.intel.com/sites/products/documentation/doclib/tbb_sa/help/index.htm> diff --git a/docs.it4i/anselm/software/intel-suite/introduction.md b/docs.it4i/anselm/software/intel-suite/introduction.md deleted file mode 100644 index 879389f3f119e873d375b585da4e56f0dcfa5a79..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/intel-suite/introduction.md +++ /dev/null @@ -1,62 +0,0 @@ -# Intel Parallel Studio - -The Anselm cluster provides following elements of the Intel Parallel Studio XE - -* Intel Compilers -* Intel Debugger -* Intel MKL Library -* Intel Integrated Performance Primitives Library -* Intel Threading Building Blocks Library - -## Intel Compilers - -The Intel compilers version 13.1.3 are available, via module intel. The compilers include the icc C and C++ compiler and the ifort fortran 77/90/95 compiler. - -```console -$ ml intel -$ icc -v -$ ifort -v -``` - -Read more at the [Intel Compilers](intel-compilers/) page. - -## Intel Debugger - -The intel debugger version 13.0 is available, via module intel. The debugger works for applications compiled with C and C++ compiler and the ifort fortran 77/90/95 compiler. The debugger provides java GUI environment. Use X display for running the GUI. - -```console -$ ml intel -$ idb -``` - -Read more at the [Intel Debugger](intel-debugger/) page. - -## Intel Math Kernel Library - -Intel Math Kernel Library (Intel MKL) is a library of math kernel subroutines, extensively threaded and optimized for maximum performance. Intel MKL unites and provides these basic components: BLAS, LAPACK, ScaLapack, PARDISO, FFT, VML, VSL, Data fitting, Feast Eigensolver and many more. - -```console -$ ml imkl -``` - -Read more at the [Intel MKL](intel-mkl/) page. - -## Intel Integrated Performance Primitives - -Intel Integrated Performance Primitives, version 7.1.1, compiled for AVX is available, via module ipp. The IPP is a library of highly optimized algorithmic building blocks for media and data applications. This includes signal, image and frame processing algorithms, such as FFT, FIR, Convolution, Optical Flow, Hough transform, Sum, MinMax and many more. - -```console -$ ml ipp -``` - -Read more at the [Intel IPP](intel-integrated-performance-primitives/) page. 
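
A typical quick start combining the components summarized so far (a hedged sketch; `solver.c` is a hypothetical source file, and the `-mkl` and `-openmp` switches are the ones documented on the Intel MKL and Intel Compilers pages):

```console
$ ml intel
$ ml imkl
$ icc -mkl -openmp -o solver.x solver.c
$ export OMP_NUM_THREADS=16
$ ./solver.x
```
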
- -## Intel Threading Building Blocks - -Intel Threading Building Blocks (Intel TBB) is a library that supports scalable parallel programming using standard ISO C++ code. It does not require special languages or compilers. It is designed to promote scalable data parallel programming. Additionally, it fully supports nested parallelism, so you can build larger parallel components from smaller parallel components. To use the library, you specify tasks, not threads, and let the library map tasks onto threads in an efficient manner. - -```console -$ ml tbb -``` - -Read more at the [Intel TBB](intel-tbb/) page. diff --git a/docs.it4i/anselm/software/java.md b/docs.it4i/anselm/software/java.md deleted file mode 100644 index a9de126760592f8fdb983242eb397ebf00c80c42..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/java.md +++ /dev/null @@ -1,27 +0,0 @@ -# Java - -## Java on ANSELM - -Java is available on Anselm cluster. Activate java by loading the java module - -```console -$ ml Java -``` - -Note that the java module must be loaded on the compute nodes as well, in order to run java on compute nodes. - -Check for java version and path - -```console -$ java -version -$ which java -``` - -With the module loaded, not only the runtime environment (JRE), but also the development environment (JDK) with the compiler is available. - -```console -$ javac -version -$ which javac -``` - -Java applications may use MPI for inter-process communication, in conjunction with OpenMPI. Read more on <http://www.open-mpi.org/faq/?category=java>. This functionality is currently not supported on Anselm cluster. In case you require the java interface to MPI, please contact [Anselm support](https://support.it4i.cz/rt/). diff --git a/docs.it4i/anselm/software/machine-learning/tensorflow.md b/docs.it4i/anselm/software/machine-learning/tensorflow.md deleted file mode 100644 index b2cb97e7797d62b3e3ffd23ca20f937df5cdd868..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/machine-learning/tensorflow.md +++ /dev/null @@ -1,62 +0,0 @@ -# TensorFlow - -TensorFlow is an open-source software library for machine intelligence. - -## TensorFlow modules - -Anselm provides three different TensorFlow modules: - * Tensorflow/1.1.0 - * Tensorflow/1.1.0-CUDA-7.5.18-Python-3.6.1 - * Tensorflow/1.1.0-CUDA-8.0.44-Python-3.6.1 - -### Tensorflow/1.1.0 (CPU only) - -TensorFlow 1.1 build. - -```console -$ ml Tensorflow/1.1.0 -``` - -This module was built with: - * GCC/4.9.3 - * Python/3.6.1 - -### Tensorflow/1.1.0-CUDA-7.5.18-Python-3.6.1 (GPU enabled) - -TensorFlow 1.1 with GPU support. - -```console -$ ml Tensorflow/1.1.0-CUDA-7.5.18-Python-3.6.1 -``` - -This module was built with: - * GCC/4.9.3 - * Python/3.6.1 - * CUDA/7.5.18 - * cuDNN/5.1-CUDA-7.5.18 - -### Tensorflow/1.1.0-CUDA-8.0.44-Python-3.6.1 (GPU enabled) - -TensorFlow 1.1 with GPU support. - -```console -$ ml Tensorflow/1.1.0-CUDA-8.0.44-Python-3.6.1 -``` - -This module was built with: - * GCC/4.9.3 - * Python/3.6.1 - * CUDA/8.0.44 - * cuDNN/5.1-CUDA-8.0.44 - -## TensorFlow application example - -After loading one of the available TensorFlow modules, you can check the functionality running the following python script. 
- -```python -import tensorflow as tf - -c = tf.constant('Hello World!') -sess = tf.Session() -print(sess.run(c)) -``` diff --git a/docs.it4i/anselm/software/mpi/mpi.md b/docs.it4i/anselm/software/mpi/mpi.md deleted file mode 100644 index 4313bf513d5262a4b3eba0f1ef10380142f3a2ef..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/mpi/mpi.md +++ /dev/null @@ -1,146 +0,0 @@ -# MPI - -## Setting Up MPI Environment - -The Anselm cluster provides several implementations of the MPI library: - -| MPI Library | Thread support | -| ---------------------------------------------------- | --------------------------------------------------------------- | -| The highly optimized and stable **bullxmpi 1.2.4.1** | Partial thread support up to MPI_THREAD_SERIALIZED | -| The **Intel MPI 4.1** | Full thread support up to MPI_THREAD_MULTIPLE | -| The [OpenMPI 1.6.5](href="http://www.open-mpi.org) | Full thread support up to MPI_THREAD_MULTIPLE, BLCR c/r support | -| The OpenMPI 1.8.1 | Full thread support up to MPI_THREAD_MULTIPLE, MPI-3.0 support | -| The **mpich2 1.9** | Full thread support up to MPI_THREAD_MULTIPLE, BLCR c/r support | - -MPI libraries are activated via the environment modules. - -```console -$ ml av mpi/ - ------------------------- /opt/modules/modulefiles/mpi ------------------------- - bullxmpi/bullxmpi-1.2.4.1 mvapich2/1.9-icc - impi/4.0.3.008 openmpi/1.6.5-gcc(default) - impi/4.1.0.024 openmpi/1.6.5-gcc46 - impi/4.1.0.030 openmpi/1.6.5-icc - impi/4.1.1.036(default) openmpi/1.8.1-gcc - openmpi/1.8.1-gcc46 - mvapich2/1.9-gcc(default) openmpi/1.8.1-gcc49 - mvapich2/1.9-gcc46 openmpi/1.8.1-icc -``` - -There are default compilers associated with any particular MPI implementation. The defaults may be changed, the MPI libraries may be used in conjunction with any compiler. The defaults are selected via the modules in following way - -| Module | MPI | Compiler suite | -| ------------ | ---------------- | ------------------------------------------------------------------------------ | -| PrgEnv-gnu | bullxmpi-1.2.4.1 | bullx GNU 4.4.6 | -| PrgEnv-intel | Intel MPI 4.1.1 | Intel 13.1.1 | -| bullxmpi | bullxmpi-1.2.4.1 | none, select via module | -| impi | Intel MPI 4.1.1 | none, select via module | -| openmpi | OpenMPI 1.6.5 | GNU compilers 4.8.1, GNU compilers 4.4.6, Intel Compilers | -| openmpi | OpenMPI 1.8.1 | GNU compilers 4.8.1, GNU compilers 4.4.6, GNU compilers 4.9.0, Intel Compilers | -| mvapich2 | MPICH2 1.9 | GNU compilers 4.8.1, GNU compilers 4.4.6, Intel Compilers | - -Examples: - -```console -$ ml OpenMPI **or** ml openmpi **for older versions** -``` - -In this example, we activate the latest openmpi with latest GNU compilers - -To use openmpi with the intel compiler suite, use - -```console -$ ml intel -$ ml openmpi/1.6.5-icc -``` - -In this example, the openmpi 1.6.5 using intel compilers is activated - -## Compiling MPI Programs - -!!! note - After setting up your MPI environment, compile your program using one of the mpi wrappers - -```console -$ mpicc -v -$ mpif77 -v -$ mpif90 -v -``` - -Example program: - -```cpp - // helloworld_mpi.c - #include <stdio.h> - - #include<mpi.h> - - int main(int argc, char **argv) { - - int len; - int rank, size; - char node[MPI_MAX_PROCESSOR_NAME]; - - // Initiate MPI - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD,&rank); - MPI_Comm_size(MPI_COMM_WORLD,&size); - - // Get hostame and print - MPI_Get_processor_name(node,&len); - printf("Hello world! 
from rank %d of %d on host %sn",rank,size,node); - - // Finalize and exit - MPI_Finalize(); - - return 0; - } -``` - -Compile the above example with - -```console -$ mpicc helloworld_mpi.c -o helloworld_mpi.x -``` - -## Running MPI Programs - -!!! note - The MPI program executable must be compatible with the loaded MPI module. - Always compile and execute using the very same MPI module. - -It is strongly discouraged to mix mpi implementations. Linking an application with one MPI implementation and running mpirun/mpiexec form other implementation may result in unexpected errors. - -The MPI program executable must be available within the same path on all nodes. This is automatically fulfilled on the /home and /scratch file system. You need to preload the executable, if running on the local scratch /lscratch file system. - -### Ways to Run MPI Programs - -Optimal way to run an MPI program depends on its memory requirements, memory access pattern and communication pattern. - -!!! note - Consider these ways to run an MPI program: - - 1. One MPI process per node, 16 threads per process - 2. Two MPI processes per node, 8 threads per process - 3. 16 MPI processes per node, 1 thread per process. - -**One MPI** process per node, using 16 threads, is most useful for memory demanding applications, that make good use of processor cache memory and are not memory bound. This is also a preferred way for communication intensive applications as one process per node enjoys full bandwidth access to the network interface. - -**Two MPI** processes per node, using 8 threads each, bound to processor socket is most useful for memory bandwidth bound applications such as BLAS1 or FFT, with scalable memory demand. However, note that the two processes will share access to the network interface. The 8 threads and socket binding should ensure maximum memory access bandwidth and minimize communication, migration and NUMA effect overheads. - -!!! note - Important! Bind every OpenMP thread to a core! - -In the previous two cases with one or two MPI processes per node, the operating system might still migrate OpenMP threads between cores. You want to avoid this by setting the KMP_AFFINITY or GOMP_CPU_AFFINITY environment variables. - -**16 MPI** processes per node, using 1 thread each bound to processor core is most suitable for highly scalable applications with low communication demand. - -### Running OpenMPI - -The **bullxmpi-1.2.4.1** and [**OpenMPI 1.6.5**](http://www.open-mpi.org/) are both based on OpenMPI. Read more on [how to run OpenMPI](Running_OpenMPI/) based MPI. - -### Running MPICH2 - -The **Intel MPI** and **mpich2 1.9** are MPICH2 based implementations. Read more on [how to run MPICH2](running-mpich2/) based MPI. - -The Intel MPI may run on the Intel Xeon Phi accelerators as well. Read more on [how to run Intel MPI on accelerators](../intel-xeon-phi/). diff --git a/docs.it4i/anselm/software/numerical-languages/matlab.md b/docs.it4i/anselm/software/numerical-languages/matlab.md deleted file mode 100644 index ac1b0cc5e6b5728f0079b57b771ec17a219f4d8d..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/numerical-languages/matlab.md +++ /dev/null @@ -1,280 +0,0 @@ -# Matlab - -## Introduction - -Matlab is available in versions R2015a and R2015b. There are always two variants of the release: - -* Non commercial or so called EDU variant, which can be used for common research and educational purposes. -* Commercial or so called COM variant, which can used also for commercial activities. 
The licenses for commercial variant are much more expensive, so usually the commercial variant has only subset of features compared to the EDU available. - -To load the latest version of Matlab load the module - -```console -$ ml MATLAB -``` - -By default the EDU variant is marked as default. If you need other version or variant, load the particular version. To obtain the list of available versions use - -```console -$ ml av MATLAB -``` - -If you need to use the Matlab GUI to prepare your Matlab programs, you can use Matlab directly on the login nodes. But for all computations use Matlab on the compute nodes via PBS Pro scheduler. - -If you require the Matlab GUI, please follow the general information about [running graphical applications](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/). - -Matlab GUI is quite slow using the X forwarding built in the PBS (qsub -X), so using X11 display redirection either via SSH or directly by xauth (please see the "GUI Applications on Compute Nodes over VNC" part [here](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/x-window-system/)) is recommended. - -To run Matlab with GUI, use - -```console -$ matlab -``` - -To run Matlab in text mode, without the Matlab Desktop GUI environment, use - -```console -$ matlab -nodesktop -nosplash -``` - -plots, images, etc... will be still available. - -## Running Parallel Matlab Using Distributed Computing Toolbox / Engine - -!!! note - Distributed toolbox is available only for the EDU variant - -The MPIEXEC mode available in previous versions is no longer available in MATLAB 2015. Also, the programming interface has changed. Refer to [Release Notes](http://www.mathworks.com/help/distcomp/release-notes.html#buanp9e-1). - -Delete previously used file mpiLibConf.m, we have observed crashes when using Intel MPI. - -To use Distributed Computing, you first need to setup a parallel profile. We have provided the profile for you, you can either import it in MATLAB command line: - -```console - >> parallel.importProfile('/apps/all/MATLAB/2015a-EDU/SalomonPBSPro.settings') - - ans = - - SalomonPBSPro -``` - -Or in the GUI, go to tab HOME -> Parallel -> Manage Cluster Profiles..., click Import and navigate to: - -/apps/all/MATLAB/2015a-EDU/SalomonPBSPro.settings - -With the new mode, MATLAB itself launches the workers via PBS, so you can either use interactive mode or a batch mode on one node, but the actual parallel processing will be done in a separate job started by MATLAB itself. Alternatively, you can use "local" mode to run parallel code on just a single node. - -!!! note - The profile is confusingly named Salomon, but you can use it also on Anselm. - -### Parallel Matlab Interactive Session - -Following example shows how to start interactive session with support for Matlab GUI. For more information about GUI based applications on Anselm see [this page](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/x-window-system/). - -```console -$ xhost + -$ qsub -I -v DISPLAY=$(uname -n):$(echo $DISPLAY | cut -d ':' -f 2) -A NONE-0-0 -q qexp -l select=1 -l walltime=00:30:00 -l feature__matlab__MATLAB=1 -``` - -This qsub command example shows how to run Matlab on a single node. - -The second part of the command shows how to request all necessary licenses. In this case 1 Matlab-EDU license and 48 Distributed Computing Engines licenses. 
- -Once the access to compute nodes is granted by PBS, user can load following modules and start Matlab: - -```console -r1i0n17$ ml MATLAB/2015b-EDU -r1i0n17$ matlab & -``` - -### Parallel Matlab Batch Job in Local Mode - -To run matlab in batch mode, write an matlab script, then write a bash jobscript and execute via the qsub command. By default, matlab will execute one matlab worker instance per allocated core. - -```bash - #!/bin/bash - #PBS -A PROJECT ID - #PBS -q qprod - #PBS -l select=1:ncpus=16:mpiprocs=16:ompthreads=1 - - # change to shared scratch directory - SCR=/scratch/work/user/$USER/$PBS_JOBID - mkdir -p $SCR ; cd $SCR || exit - - # copy input file to scratch - cp $PBS_O_WORKDIR/matlabcode.m . - - # load modules - module load MATLAB/2015a-EDU - - # execute the calculation - matlab -nodisplay -r matlabcode > output.out - - # copy output file to home - cp output.out $PBS_O_WORKDIR/. -``` - -This script may be submitted directly to the PBS workload manager via the qsub command. The inputs and matlab script are in matlabcode.m file, outputs in output.out file. Note the missing .m extension in the matlab -r matlabcodefile call, **the .m must not be included**. Note that the **shared /scratch must be used**. Further, it is **important to include quit** statement at the end of the matlabcode.m script. - -Submit the jobscript using qsub - -```console -$ qsub ./jobscript -``` - -### Parallel Matlab Local Mode Program Example - -The last part of the configuration is done directly in the user Matlab script before Distributed Computing Toolbox is started. - -```console - cluster = parcluster('local') -``` - -This script creates scheduler object "cluster" of type "local" that starts workers locally. - -!!! note - Every Matlab script that needs to initialize/use matlabpool has to contain these three lines prior to calling parpool(sched, ...) function. - -The last step is to start matlabpool with "cluster" object and correct number of workers. We have 24 cores per node, so we start 24 workers. - -```console - parpool(cluster,16); - - - ... parallel code ... - - - parpool close -``` - -The complete example showing how to use Distributed Computing Toolbox in local mode is shown here. - -```console - cluster = parcluster('local'); - cluster - - parpool(cluster,24); - - n=2000; - - W = rand(n,n); - W = distributed(W); - x = (1:n)'; - x = distributed(x); - spmd - [~, name] = system('hostname') - - T = W*x; % Calculation performed on labs, in parallel. - % T and W are both codistributed arrays here. - end - T; - whos % T and W are both distributed arrays here. - - parpool close - quit -``` - -You can copy and paste the example in a .m file and execute. Note that the parpool size should correspond to **total number of cores** available on allocated nodes. - -### Parallel Matlab Batch Job Using PBS Mode (Workers Spawned in a Separate Job) - -This mode uses PBS scheduler to launch the parallel pool. It uses the SalomonPBSPro profile that needs to be imported to Cluster Manager, as mentioned before. This methodod uses MATLAB's PBS Scheduler interface - it spawns the workers in a separate job submitted by MATLAB using qsub. 
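
Because the workers run in a separate job submitted on your behalf, an additional job may appear in the queue while the parallel pool is active; it can be watched with standard PBS commands, for example:

```console
$ qstat -u $USER
```
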
- -This is an example of m-script using PBS mode: - -```console - cluster = parcluster('SalomonPBSPro'); - set(cluster, 'SubmitArguments', '-A OPEN-0-0'); - set(cluster, 'ResourceTemplate', '-q qprod -l select=10:ncpus=16'); - set(cluster, 'NumWorkers', 160); - - pool = parpool(cluster, 160); - - n=2000; - - W = rand(n,n); - W = distributed(W); - x = (1:n)'; - x = distributed(x); - spmd - [~, name] = system('hostname') - - T = W*x; % Calculation performed on labs, in parallel. - % T and W are both codistributed arrays here. - end - whos % T and W are both distributed arrays here. - - % shut down parallel pool - delete(pool) -``` - -Note that we first construct a cluster object using the imported profile, then set some important options, namely: SubmitArguments, where you need to specify accounting id, and ResourceTemplate, where you need to specify number of nodes to run the job. - -You can start this script using batch mode the same way as in Local mode example. - -### Parallel Matlab Batch With Direct Launch (Workers Spawned Within the Existing Job) - -This method is a "hack" invented by us to emulate the mpiexec functionality found in previous MATLAB versions. We leverage the MATLAB Generic Scheduler interface, but instead of submitting the workers to PBS, we launch the workers directly within the running job, thus we avoid the issues with master script and workers running in separate jobs (issues with license not available, waiting for the worker's job to spawn etc.) - -!!! warning - This method is experimental. - -For this method, you need to use SalomonDirect profile, import it using [the same way as SalomonPBSPro](matlab/#running-parallel-matlab-using-distributed-computing-toolbox---engine) - -This is an example of m-script using direct mode: - -```console - parallel.importProfile('/apps/all/MATLAB/2015a-EDU/SalomonDirect.settings') - cluster = parcluster('SalomonDirect'); - set(cluster, 'NumWorkers', 48); - - pool = parpool(cluster, 48); - - n=2000; - - W = rand(n,n); - W = distributed(W); - x = (1:n)'; - x = distributed(x); - spmd - [~, name] = system('hostname') - - T = W*x; % Calculation performed on labs, in parallel. - % T and W are both codistributed arrays here. - end - whos % T and W are both distributed arrays here. - - % shut down parallel pool - delete(pool) -``` - -### Non-Interactive Session and Licenses - -If you want to run batch jobs with Matlab, be sure to request appropriate license features with the PBS Pro scheduler, at least the `-l _feature_matlab_MATLAB=1` for EDU variant of Matlab. More information about how to check the license features states and how to request them with PBS Pro, please [look here](../isv_licenses/). - -In case of non-interactive session please read the [following information](../isv_licenses/) on how to modify the qsub command to test for available licenses prior getting the resource allocation. - -### Matlab Distributed Computing Engines Start Up Time - -Starting Matlab workers is an expensive process that requires certain amount of time. For your information please see the following table: - -| compute nodes | number of workers | start-up time[s] | -| ------------- | ----------------- | ---------------- | -| 16 | 384 | 831 | -| 8 | 192 | 807 | -| 4 | 96 | 483 | -| 2 | 48 | 16 | - -## MATLAB on UV2000 - -UV2000 machine available in queue "qfat" can be used for MATLAB computations. This is a SMP NUMA machine with large amount of RAM, which can be beneficial for certain types of MATLAB jobs. 
CPU cores are allocated in chunks of 8 for this machine. - -You can use MATLAB on UV2000 in two parallel modes: - -### Threaded Mode - -Since this is a SMP machine, you can completely avoid using Parallel Toolbox and use only MATLAB's threading. MATLAB will automatically detect the number of cores you have allocated and will set maxNumCompThreads accordingly and certain operations, such as fft, , eig, svd, etc. will be automatically run in threads. The advantage of this mode is that you don't need to modify your existing sequential codes. - -### Local Cluster Mode - -You can also use Parallel Toolbox on UV2000. Use l[ocal cluster mode](matlab/#parallel-matlab-batch-job-in-local-mode), "SalomonPBSPro" profile will not work. diff --git a/docs.it4i/anselm/software/numerical-languages/r.md b/docs.it4i/anselm/software/numerical-languages/r.md deleted file mode 100644 index 8916ccb7cc21a1e9bf7de6bda24d1a38bdf82263..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/numerical-languages/r.md +++ /dev/null @@ -1,399 +0,0 @@ -# R - -## Introduction - -The R is a language and environment for statistical computing and graphics. R provides a wide variety of statistical (linear and nonlinear modelling, classical statistical tests, time-series analysis, classification, clustering, ...) and graphical techniques, and is highly extensible. - -One of R's strengths is the ease with which well-designed publication-quality plots can be produced, including mathematical symbols and formulae where needed. Great care has been taken over the defaults for the minor design choices in graphics, but the user retains full control. - -Another convenience is the ease with which the C code or third party libraries may be integrated within R. - -Extensive support for parallel computing is available within R. - -Read more on <http://www.r-project.org/>, <http://cran.r-project.org/doc/manuals/r-release/R-lang.html> - -## Modules - -The R version 3.0.1 is available on Anselm, along with GUI interface Rstudio - -| Application | Version | module | -| ----------- | ------------ | ------- | -| **R** | R 3.0.1 | R | -| **Rstudio** | Rstudio 0.97 | Rstudio | - -```console -$ ml R -``` - -## Execution - -The R on Anselm is linked to highly optimized MKL mathematical library. This provides threaded parallelization to many R kernels, notably the linear algebra subroutines. The R runs these heavy calculation kernels without any penalty. By default, the R would parallelize to 16 threads. You may control the threads by setting the OMP_NUM_THREADS environment variable. - -### Interactive Execution - -To run R interactively, using Rstudio GUI, log in with ssh -X parameter for X11 forwarding. Run rstudio: - -```console -$ ml Rstudio -$ rstudio -``` - -### Batch Execution - -To run R in batch mode, write an R script, then write a bash jobscript and execute via the qsub command. By default, R will use 16 threads when running MKL kernels. - -Example jobscript: - -```bash - #!/bin/bash - - # change to local scratch directory - cd /lscratch/$PBS_JOBID || exit - - # copy input file to scratch - cp $PBS_O_WORKDIR/rscript.R . - - # load R module - module load R - - # execute the calculation - R CMD BATCH rscript.R routput.out - - # copy output file to home - cp routput.out $PBS_O_WORKDIR/. - - #exit - exit -``` - -This script may be submitted directly to the PBS workload manager via the qsub command. The inputs are in rscript.R file, outputs in routput.out file. 
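
Assuming the jobscript above is saved as `./jobscript` (the file name is arbitrary), it is submitted the usual way:

```console
$ qsub ./jobscript
```
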
See the single node jobscript example in the [Job execution section](../../job-submission-and-execution/). - -## Parallel R - -Parallel execution of R may be achieved in many ways. One approach is the implied parallelization due to linked libraries or specially enabled functions, as [described above](r/#interactive-execution). In the following sections, we focus on explicit parallelization, where parallel constructs are directly stated within the R script. - -## Package Parallel - -The package parallel provides support for parallel computation, including by forking (taken from package multicore), by sockets (taken from package snow) and random-number generation. - -The package is activated this way: - -```console -$ R - > library(parallel) -``` - -More information and examples may be obtained directly by reading the documentation available in R - -```console - > ?parallel - > library(help = "parallel") - > vignette("parallel") -``` - -Download the package [parallell](package-parallel-vignette.pdf) vignette. - -The forking is the most simple to use. Forking family of functions provide parallelized, drop in replacement for the serial apply() family of functions. - -!!! note - Forking via package parallel provides functionality similar to OpenMP construct - - omp parallel for - - Only cores of single node can be utilized this way! - -Forking example: - -```r - library(parallel) - - #integrand function - f <- function(i,h) { - x <- h*(i-0.5) - return (4/(1 + x*x)) - } - - #initialize - size <- detectCores() - - while (TRUE) - { - #read number of intervals - cat("Enter the number of intervals: (0 quits) ") - fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) - - if(n<=0) break - - #run the calculation - n <- max(n,size) - h <- 1.0/n - - i <- seq(1,n); - pi3 <- h*sum(simplify2array(mclapply(i,f,h,mc.cores=size))); - - #print results - cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) - } -``` - -The above example is the classic parallel example for calculating the number Ď€. Note the **detectCores()** and **mclapply()** functions. Execute the example as: - -```console -$ R --slave --no-save --no-restore -f pi3p.R -``` - -Every evaluation of the integrad function runs in parallel on different process. - -## Package Rmpi - -!!! note - package Rmpi provides an interface (wrapper) to MPI APIs. - -It also provides interactive R slave environment. On Anselm, Rmpi provides interface to the [OpenMPI](../mpi-1/Running_OpenMPI/). - -Read more on Rmpi at <http://cran.r-project.org/web/packages/Rmpi/>, reference manual is available at <http://cran.r-project.org/web/packages/Rmpi/Rmpi.pdf> - -When using package Rmpi, both openmpi and R modules must be loaded - -```console -$ ml OpenMPI -$ ml R -``` - -Rmpi may be used in three basic ways. The static approach is identical to executing any other MPI programm. In addition, there is Rslaves dynamic MPI approach and the mpi.apply approach. In the following section, we will use the number Ď€ integration example, to illustrate all these concepts. - -### Static Rmpi - -Static Rmpi programs are executed via mpiexec, as any other MPI programs. Number of processes is static - given at the launch time. 
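
For instance, the process count may be set explicitly on the mpiexec command line; a sketch with 4 processes and the script name used in the example below:

```console
$ mpiexec -np 4 R --slave --no-save --no-restore -f pi3.R
```
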
- -Static Rmpi example: - -```r - library(Rmpi) - - #integrand function - f <- function(i,h) { - x <- h*(i-0.5) - return (4/(1 + x*x)) - } - - #initialize - invisible(mpi.comm.dup(0,1)) - rank <- mpi.comm.rank() - size <- mpi.comm.size() - n<-0 - - while (TRUE) - { - #read number of intervals - if (rank==0) { - cat("Enter the number of intervals: (0 quits) ") - fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) - } - - #broadcat the intervals - n <- mpi.bcast(as.integer(n),type=1) - - if(n<=0) break - - #run the calculation - n <- max(n,size) - h <- 1.0/n - - i <- seq(rank+1,n,size); - mypi <- h*sum(sapply(i,f,h)); - - pi3 <- mpi.reduce(mypi) - - #print results - if (rank==0) cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) - } - - mpi.quit() -``` - -The above is the static MPI example for calculating the number Ď€. Note the **library(Rmpi)** and **mpi.comm.dup()** function calls. - -Execute the example as: - -```console -$ mpiexec R --slave --no-save --no-restore -f pi3.R -``` - -### Dynamic Rmpi - -Dynamic Rmpi programs are executed by calling the R directly. openmpi module must be still loaded. The R slave processes will be spawned by a function call within the Rmpi program. - -Dynamic Rmpi example: - -```r - #integrand function - f <- function(i,h) { - x <- h*(i-0.5) - return (4/(1 + x*x)) - } - - #the worker function - workerpi <- function() - { - #initialize - rank <- mpi.comm.rank() - size <- mpi.comm.size() - n<-0 - - while (TRUE) - { - #read number of intervals - if (rank==0) { - cat("Enter the number of intervals: (0 quits) ") - fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) - } - - #broadcat the intervals - n <- mpi.bcast(as.integer(n),type=1) - - if(n<=0) break - - #run the calculation - n <- max(n,size) - h <- 1.0/n - - i <- seq(rank+1,n,size); - mypi <- h*sum(sapply(i,f,h)); - - pi3 <- mpi.reduce(mypi) - - #print results - if (rank==0) cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) - } - } - - #main - library(Rmpi) - - cat("Enter the number of slaves: ") - fp<-file("stdin"); ns<-scan(fp,nmax=1); close(fp) - - mpi.spawn.Rslaves(nslaves=ns) - mpi.bcast.Robj2slave(f) - mpi.bcast.Robj2slave(workerpi) - - mpi.bcast.cmd(workerpi()) - workerpi() - - mpi.quit() -``` - -The above example is the dynamic MPI example for calculating the number Ď€. Both master and slave processes carry out the calculation. Note the mpi.spawn.Rslaves(), mpi.bcast.Robj2slave()** and the mpi.bcast.cmd()** function calls. - -Execute the example as: - -```console -$ R --slave --no-save --no-restore -f pi3Rslaves.R -``` - -### mpi.apply Rmpi - -mpi.apply is a specific way of executing Dynamic Rmpi programs. - -!!! note - mpi.apply() family of functions provide MPI parallelized, drop in replacement for the serial apply() family of functions. - -Execution is identical to other dynamic Rmpi programs. 
- -mpi.apply Rmpi example: - -```r - #integrand function - f <- function(i,h) { - x <- h*(i-0.5) - return (4/(1 + x*x)) - } - - #the worker function - workerpi <- function(rank,size,n) - { - #run the calculation - n <- max(n,size) - h <- 1.0/n - - i <- seq(rank,n,size); - mypi <- h*sum(sapply(i,f,h)); - - return(mypi) - } - - #main - library(Rmpi) - - cat("Enter the number of slaves: ") - fp<-file("stdin"); ns<-scan(fp,nmax=1); close(fp) - - mpi.spawn.Rslaves(nslaves=ns) - mpi.bcast.Robj2slave(f) - mpi.bcast.Robj2slave(workerpi) - - while (TRUE) - { - #read number of intervals - cat("Enter the number of intervals: (0 quits) ") - fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) - if(n<=0) break - - #run workerpi - i=seq(1,2*ns) - pi3=sum(mpi.parSapply(i,workerpi,2*ns,n)) - - #print results - cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) - } - - mpi.quit() -``` - -The above is the mpi.apply MPI example for calculating the number Ď€. Only the slave processes carry out the calculation. Note the **mpi.parSapply()**, function call. The package parallel [example](r/#package-parallel)[above](r/#package-parallel) may be trivially adapted (for much better performance) to this structure using the mclapply() in place of mpi.parSapply(). - -Execute the example as: - -```console -$ R --slave --no-save --no-restore -f pi3parSapply.R -``` - -## Combining Parallel and Rmpi - -Currently, the two packages can not be combined for hybrid calculations. - -## Parallel Execution - -The R parallel jobs are executed via the PBS queue system exactly as any other parallel jobs. User must create an appropriate jobscript and submit via the **qsub** - -Example jobscript for [static Rmpi](r/#static-rmpi) parallel R execution, running 1 process per core: - -```bash - #!/bin/bash - #PBS -q qprod - #PBS -N Rjob - #PBS -l select=100:ncpus=16:mpiprocs=16:ompthreads=1 - - # change to scratch directory - SCRDIR=/scratch/$USER/myjob - cd $SCRDIR || exit - - # copy input file to scratch - cp $PBS_O_WORKDIR/rscript.R . - - # load R and openmpi module - module load R - module load openmpi - - # execute the calculation - mpiexec -bycore -bind-to-core R --slave --no-save --no-restore -f rscript.R - - # copy output file to home - cp routput.out $PBS_O_WORKDIR/. - - #exit - exit -``` - -For more information about jobscript and MPI execution refer to the [Job submission](../../job-submission-and-execution/) and general [MPI](../mpi/mpi/) sections. diff --git a/docs.it4i/anselm/software/nvidia-cuda.md b/docs.it4i/anselm/software/nvidia-cuda.md index 6b06d9384302e0e023f807dcb2eb983a11b3b73a..91251e132e59d3c86d1b60169f7da82cfae2fcee 100644 --- a/docs.it4i/anselm/software/nvidia-cuda.md +++ b/docs.it4i/anselm/software/nvidia-cuda.md @@ -91,92 +91,92 @@ Expected output of the deviceQuery example executed on a node with Tesla K20m is In this section we provide a basic CUDA based vector addition code example. You can directly copy and paste the code to test it. 
-```console +```cpp $ vim test.cu - #define N (2048*2048) - #define THREADS_PER_BLOCK 512 - - #include <stdio.h> - #include <stdlib.h> - - // GPU kernel function to add two vectors - __global__ void add_gpu( int *a, int *b, int *c, int n){ - int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < n) - c[index] = a[index] + b[index]; +#define N (2048*2048) +#define THREADS_PER_BLOCK 512 + +#include <stdio.h> +#include <stdlib.h> + +// GPU kernel function to add two vectors +__global__ void add_gpu( int *a, int *b, int *c, int n){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) + c[index] = a[index] + b[index]; +} + +// CPU function to add two vectors +void add_cpu (int *a, int *b, int *c, int n) { + for (int i=0; i < n; i++) + c[i] = a[i] + b[i]; +} + +// CPU function to generate a vector of random integers +void random_ints (int *a, int n) { + for (int i = 0; i < n; i++) + a[i] = rand() % 10000; // random number between 0 and 9999 +} + +// CPU function to compare two vectors +int compare_ints( int *a, int *b, int n ){ + int pass = 0; + for (int i = 0; i < N; i++){ + if (a[i] != b[i]) { + printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]); + pass = 1; } + } + if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); + return pass; +} - // CPU function to add two vectors - void add_cpu (int *a, int *b, int *c, int n) { - for (int i=0; i < n; i++) - c[i] = a[i] + b[i]; - } +int main( void ) { - // CPU function to generate a vector of random integers - void random_ints (int *a, int n) { - for (int i = 0; i < n; i++) - a[i] = rand() % 10000; // random number between 0 and 9999 - } + int *a, *b, *c; // host copies of a, b, c + int *dev_a, *dev_b, *dev_c; // device copies of a, b, c + int size = N * sizeof( int ); // we need space for N integers - // CPU function to compare two vectors - int compare_ints( int *a, int *b, int n ){ - int pass = 0; - for (int i = 0; i < N; i++){ - if (a[i] != b[i]) { - printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]); - pass = 1; - } - } - if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); - return pass; - } - - int main( void ) { - - int *a, *b, *c; // host copies of a, b, c - int *dev_a, *dev_b, *dev_c; // device copies of a, b, c - int size = N * sizeof( int ); // we need space for N integers + // Allocate GPU/device copies of dev_a, dev_b, dev_c + cudaMalloc( (void**)&dev_a, size ); + cudaMalloc( (void**)&dev_b, size ); + cudaMalloc( (void**)&dev_c, size ); - // Allocate GPU/device copies of dev_a, dev_b, dev_c - cudaMalloc( (void**)&dev_a, size ); - cudaMalloc( (void**)&dev_b, size ); - cudaMalloc( (void**)&dev_c, size ); + // Allocate CPU/host copies of a, b, c + a = (int*)malloc( size ); + b = (int*)malloc( size ); + c = (int*)malloc( size ); - // Allocate CPU/host copies of a, b, c - a = (int*)malloc( size ); - b = (int*)malloc( size ); - c = (int*)malloc( size ); + // Fill input vectors with random integer numbers + random_ints( a, N ); + random_ints( b, N ); - // Fill input vectors with random integer numbers - random_ints( a, N ); - random_ints( b, N ); + // copy inputs to device + cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ); + cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ); - // copy inputs to device - cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ); - cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ); + // launch add_gpu() kernel with blocks and threads + add_gpu<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK >>( dev_a, dev_b, 
dev_c, N ); - // launch add_gpu() kernel with blocks and threads - add_gpu<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK >>( dev_a, dev_b, dev_c, N ); + // copy device result back to host copy of c + cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost ); - // copy device result back to host copy of c - cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost ); + //Check the results with CPU implementation + int *c_h; c_h = (int*)malloc( size ); + add_cpu (a, b, c_h, N); + compare_ints(c, c_h, N); - //Check the results with CPU implementation - int *c_h; c_h = (int*)malloc( size ); - add_cpu (a, b, c_h, N); - compare_ints(c, c_h, N); + // Clean CPU memory allocations + free( a ); free( b ); free( c ); free (c_h); - // Clean CPU memory allocations - free( a ); free( b ); free( c ); free (c_h); + // Clean GPU memory allocations + cudaFree( dev_a ); + cudaFree( dev_b ); + cudaFree( dev_c ); - // Clean GPU memory allocations - cudaFree( dev_a ); - cudaFree( dev_b ); - cudaFree( dev_c ); - - return 0; - } + return 0; +} ``` This code can be compiled using following command @@ -204,81 +204,81 @@ The NVIDIA CUDA Basic Linear Algebra Subroutines (cuBLAS) library is a GPU-accel SAXPY function multiplies the vector x by the scalar alpha and adds it to the vector y overwriting the latest vector with the result. The description of the cuBLAS function can be found in [NVIDIA CUDA documentation](http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-axpy "Nvidia CUDA documentation "). Code can be pasted in the file and compiled without any modification. ```cpp - /* Includes, system */ - #include <stdio.h> - #include <stdlib.h> - - /* Includes, cuda */ - #include <cuda_runtime.h> - #include <cublas_v2.h> - - /* Vector size */ - #define N (32) - - /* Host implementation of a simple version of saxpi */ - void saxpy(int n, float alpha, const float *x, float *y) +/* Includes, system */ +#include <stdio.h> +#include <stdlib.h> + +/* Includes, cuda */ +#include <cuda_runtime.h> +#include <cublas_v2.h> + +/* Vector size */ +#define N (32) + +/* Host implementation of a simple version of saxpi */ +void saxpy(int n, float alpha, const float *x, float *y) +{ + for (int i = 0; i < n; ++i) + y[i] = alpha*x[i] + y[i]; +} + +/* Main */ +int main(int argc, char **argv) +{ + float *h_X, *h_Y, *h_Y_ref; + float *d_X = 0; + float *d_Y = 0; + + const float alpha = 1.0f; + int i; + + cublasHandle_t handle; + + /* Initialize CUBLAS */ + printf("simpleCUBLAS test running..n"); + cublasCreate(&handle); + + /* Allocate host memory for the matrices */ + h_X = (float *)malloc(N * sizeof(h_X[0])); + h_Y = (float *)malloc(N * sizeof(h_Y[0])); + h_Y_ref = (float *)malloc(N * sizeof(h_Y_ref[0])); + + /* Fill the matrices with test data */ + for (i = 0; i < N; i++) { - for (int i = 0; i < n; ++i) - y[i] = alpha*x[i] + y[i]; + h_X[i] = rand() / (float)RAND_MAX; + h_Y[i] = rand() / (float)RAND_MAX; + h_Y_ref[i] = h_Y[i]; } - /* Main */ - int main(int argc, char **argv) - { - float *h_X, *h_Y, *h_Y_ref; - float *d_X = 0; - float *d_Y = 0; - - const float alpha = 1.0f; - int i; - - cublasHandle_t handle; + /* Allocate device memory for the matrices */ + cudaMalloc((void **)&d_X, N * sizeof(d_X[0])); + cudaMalloc((void **)&d_Y, N * sizeof(d_Y[0])); - /* Initialize CUBLAS */ - printf("simpleCUBLAS test running..n"); - cublasCreate(&handle); + /* Initialize the device matrices with the host matrices */ + cublasSetVector(N, sizeof(h_X[0]), h_X, 1, d_X, 1); + cublasSetVector(N, sizeof(h_Y[0]), h_Y, 1, d_Y, 1); - /* Allocate host memory for the 
matrices */ - h_X = (float *)malloc(N * sizeof(h_X[0])); - h_Y = (float *)malloc(N * sizeof(h_Y[0])); - h_Y_ref = (float *)malloc(N * sizeof(h_Y_ref[0])); + /* Performs operation using plain C code */ + saxpy(N, alpha, h_X, h_Y_ref); - /* Fill the matrices with test data */ - for (i = 0; i < N; i++) - { - h_X[i] = rand() / (float)RAND_MAX; - h_Y[i] = rand() / (float)RAND_MAX; - h_Y_ref[i] = h_Y[i]; - } + /* Performs operation using cublas */ + cublasSaxpy(handle, N, &alpha, d_X, 1, d_Y, 1); - /* Allocate device memory for the matrices */ - cudaMalloc((void **)&d_X, N * sizeof(d_X[0])); - cudaMalloc((void **)&d_Y, N * sizeof(d_Y[0])); + /* Read the result back */ + cublasGetVector(N, sizeof(h_Y[0]), d_Y, 1, h_Y, 1); - /* Initialize the device matrices with the host matrices */ - cublasSetVector(N, sizeof(h_X[0]), h_X, 1, d_X, 1); - cublasSetVector(N, sizeof(h_Y[0]), h_Y, 1, d_Y, 1); + /* Check result against reference */ + for (i = 0; i < N; ++i) + printf("CPU res = %f t GPU res = %f t diff = %f n", h_Y_ref[i], h_Y[i], h_Y_ref[i] - h_Y[i]); - /* Performs operation using plain C code */ - saxpy(N, alpha, h_X, h_Y_ref); + /* Memory clean up */ + free(h_X); free(h_Y); free(h_Y_ref); + cudaFree(d_X); cudaFree(d_Y); - /* Performs operation using cublas */ - cublasSaxpy(handle, N, &alpha, d_X, 1, d_Y, 1); - - /* Read the result back */ - cublasGetVector(N, sizeof(h_Y[0]), d_Y, 1, h_Y, 1); - - /* Check result against reference */ - for (i = 0; i < N; ++i) - printf("CPU res = %f t GPU res = %f t diff = %f n", h_Y_ref[i], h_Y[i], h_Y_ref[i] - h_Y[i]); - - /* Memory clean up */ - free(h_X); free(h_Y); free(h_Y_ref); - cudaFree(d_X); cudaFree(d_Y); - - /* Shutdown */ - cublasDestroy(handle); - } + /* Shutdown */ + cublasDestroy(handle); +} ``` !!! note diff --git a/docs.it4i/anselm/software/operating-system.md b/docs.it4i/anselm/software/operating-system.md deleted file mode 100644 index e43800e0d038882270620ccb8e95d50df94a5b71..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/operating-system.md +++ /dev/null @@ -1,3 +0,0 @@ -# Operating System - -The operating system on Anselm is Linux - [**Red Hat Enterprise Linux release 6.x**](https://en.wikipedia.org/wiki/Red_Hat_Enterprise_Linux). diff --git a/docs.it4i/anselm/software/paraview.md b/docs.it4i/anselm/software/paraview.md deleted file mode 100644 index 830ce72a26c0e6a22683534e12444392040c1a58..0000000000000000000000000000000000000000 --- a/docs.it4i/anselm/software/paraview.md +++ /dev/null @@ -1,85 +0,0 @@ -# ParaView - -Open-Source, Multi-Platform Data Analysis and Visualization Application - -## Introduction - -**ParaView** is an open-source, multi-platform data analysis and visualization application. ParaView users can quickly build visualizations to analyze their data using qualitative and quantitative techniques. The data exploration can be done interactively in 3D or programmatically using ParaView's batch processing capabilities. - -ParaView was developed to analyze extremely large datasets using distributed memory computing resources. It can be run on supercomputers to analyze datasets of exascale size as well as on laptops for smaller data. - -Homepage : <http://www.paraview.org/> - -## Installed Version - -Currently, version 5.1.2 compiled with intel/2017a against intel MPI library and OSMesa 12.0.2 is installed on Anselm. - -## Usage - -On Anselm, ParaView is to be used in client-server mode. 
A parallel ParaView server is launched on compute nodes by the user, and client is launched on your desktop PC to control and view the visualization. Download ParaView client application for your OS here: <http://paraview.org/paraview/resources/software.php>. - -!!!Warning - Your version must match the version number installed on Anselm. - -### Launching Server - -To launch the server, you must first allocate compute nodes, for example - -```console -$ qsub -I -q qprod -A OPEN-0-0 -l select=2 -``` - -to launch an interactive session on 2 nodes. Refer to [Resource Allocation and Job Execution](../job-submission-and-execution/) for details. - -After the interactive session is opened, load the ParaView module : - -```console -$ ml ParaView/5.1.2-intel-2017a-mpi -``` - -Now launch the parallel server, with number of nodes times 16 processes: - -```console -$ mpirun -np 32 pvserver --use-offscreen-rendering - Waiting for client... - Connection URL: cs://cn77:11111 - Accepting connection(s): cn77:11111 -``` - -Note the that the server is listening on compute node cn77 in this case, we shall use this information later. - -### Client Connection - -Because a direct connection is not allowed to compute nodes on Anselm, you must establish a SSH tunnel to connect to the server. Choose a port number on your PC to be forwarded to ParaView server, for example 12345. If your PC is running Linux, use this command to establish a SSH tunnel: - -```console -$ ssh -TN -L 12345:cn77:11111 username@anselm.it4i.cz -``` - -replace username with your login and cn77 with the name of compute node your ParaView server is running on (see previous step). - -If you use PuTTY on Windows, load Anselm connection configuration, then go to *Connection* -> *SSH* -> *Tunnels* to set up the port forwarding. - -Fill the Source port and Destination fields. **Do not forget to click the Add button.** - - - -Now launch ParaView client installed on your desktop PC. Select *File* -> *Connect*... and fill in the following : - - - -The configuration is now saved for later use. Now click Connect to connect to the ParaView server. In your terminal where you have interactive session with ParaView server launched, you should see: - -```console -Client connected. -``` - -You can now use Parallel ParaView. - -### Close Server - -Remember to close the interactive session after you finish working with ParaView server, as it will remain launched even after your client is disconnected and will continue to consume resources. - -## GPU Support - -Currently, GPU acceleration is not supported in the server and ParaView will not take advantage of accelerated nodes on Anselm. Support for GPU acceleration might be added in the future. diff --git a/docs.it4i/general/accessing-the-clusters/graphical-user-interface/vnc.md b/docs.it4i/general/accessing-the-clusters/graphical-user-interface/vnc.md index b2fa2f58dd0a04e2e5ace8e3035dc5b95fc4a1b5..9647272d1a52a3a29e17687335634a8360795d02 100644 --- a/docs.it4i/general/accessing-the-clusters/graphical-user-interface/vnc.md +++ b/docs.it4i/general/accessing-the-clusters/graphical-user-interface/vnc.md @@ -62,6 +62,7 @@ Another command: username 10296 0.0 0.0 131772 21076 pts/29 SN 13:01 0:01 /usr/bin/Xvnc :61 -desktop login2:61 (username) -auth /home/vop999/.Xauthority -geometry 1600x900 -depth 16 -rfbwait 30000 -rfbauth /home/username/.vnc/passwd -rfbport 5961 -fp catalogue:/etc/X11/fontpath.d -pn ``` + !!! note The vncserver runs on port 5900 + display number. 
You get your port number simply as 5900 + display number (in this example 61), so the result is 5961. diff --git a/docs.it4i/salomon/resources-allocation-policy.md b/docs.it4i/salomon/resources-allocation-policy.md index 28e543361ce7c544d37750e90dbef05c2243c2b5..217f394748b3da84418eca8ef34450e40b2a135b 100644 --- a/docs.it4i/salomon/resources-allocation-policy.md +++ b/docs.it4i/salomon/resources-allocation-policy.md @@ -113,6 +113,4 @@ Options: ---8<--- "resource_accounting.md" - - ---8<--- "mathjax.md" diff --git a/docs.it4i/salomon/software/ansys/ansys-cfx.md b/docs.it4i/salomon/software/ansys/ansys-cfx.md deleted file mode 100644 index 21ce8f93b16958a184d15af5235830e9d39406b9..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/ansys/ansys-cfx.md +++ /dev/null @@ -1,55 +0,0 @@ -# ANSYS CFX - -[ANSYS CFX](http://www.ansys.com/products/fluids/ansys-cfx) software is a high-performance, general purpose fluid dynamics program that has been applied to solve wide-ranging fluid flow problems for over 20 years. At the heart of ANSYS CFX is its advanced solver technology, the key to achieving reliable and accurate solutions quickly and robustly. The modern, highly parallelized solver is the foundation for an abundant choice of physical models to capture virtually any type of phenomena related to fluid flow. The solver and its many physical models are wrapped in a modern, intuitive, and flexible GUI and user environment, with extensive capabilities for customization and automation using session files, scripting and a powerful expression language. - -To run ANSYS CFX in batch mode you can utilize/modify the default cfx.pbs script and execute it via the qsub command. - -```bash -#!/bin/bash -#PBS -l nodes=2:ppn=16 -#PBS -q qprod -#PBS -N $USER-CFX-Project -#PBS -A XX-YY-ZZ - -#! Mail to user when job terminate or abort -#PBS -m ae - -#!change the working directory (default is home directory) -#cd <working directory> (working directory must exists) -WORK_DIR="/scratch/$USER/work" -cd $WORK_DIR - -echo Running on host `hostname` -echo Time is `date` -echo Directory is `pwd` -echo This jobs runs on the following processors: -echo `cat $PBS_NODEFILE` - -module load ansys - -#### Set number of processors per host listing -#### (set to 1 as $PBS_NODEFILE lists each node twice if :ppn=2) -procs_per_host=1 -#### Create host list -hl="" -for host in `cat $PBS_NODEFILE` -do - if ["$hl" = "" ] - then hl="$host:$procs_per_host" - else hl="${hl}:$host:$procs_per_host" - fi -done - -echo Machines: $hl - -#-dev input.def includes the input of CFX analysis in DEF format -#-P the name of prefered license feature (aa_r=ANSYS Academic Research, ane3fl=Multiphysics(commercial)) -/ansys_inc/v145/CFX/bin/cfx5solve -def input.def -size 4 -size-ni 4x -part-large -start-method "Platform MPI Distributed Parallel" -par-dist $hl -P aa_r -``` - -Header of the pbs file (above) is common and description can be find on [this site](../../job-submission-and-execution/). SVS FEM recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. - -Working directory has to be created before sending pbs job into the queue. Input file should be in working directory or full path to input file has to be specified. 
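
For illustration, the working directory used by the script above may be prepared and the job submitted as follows (the input and script file names are those assumed in the example):

```console
$ mkdir -p /scratch/$USER/work
$ cp input.def /scratch/$USER/work/
$ qsub cfx.pbs
```
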
>Input file has to be defined by common CFX def file which is attached to the cfx solver via parameter -def - -**License** should be selected by parameter -P (Big letter **P**). Licensed products are the following: aa_r (ANSYS **Academic** Research), ane3fl (ANSYS Multiphysics)-**Commercial**. -[More about licensing here](licensing/) diff --git a/docs.it4i/salomon/software/ansys/ansys-fluent.md b/docs.it4i/salomon/software/ansys/ansys-fluent.md deleted file mode 100644 index 27469a1c559355d1347ba3cfd76e303893caeb38..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/ansys/ansys-fluent.md +++ /dev/null @@ -1,162 +0,0 @@ -# ANSYS Fluent - -[ANSYS Fluent](http://www.ansys.com/products/fluids/ansys-fluent) -software contains the broad physical modeling capabilities needed to model flow, turbulence, heat transfer, and reactions for industrial applications ranging from air flow over an aircraft wing to combustion in a furnace, from bubble columns to oil platforms, from blood flow to semiconductor manufacturing, and from clean room design to wastewater treatment plants. Special models that give the software the ability to model in-cylinder combustion, aeroacoustics, turbomachinery, and multiphase systems have served to broaden its reach. - -1. Common way to run Fluent over pbs file - -To run ANSYS Fluent in batch mode you can utilize/modify the default fluent.pbs script and execute it via the qsub command. - -```bash -#!/bin/bash -#PBS -S /bin/bash -#PBS -l nodes=2:ppn=16 -#PBS -q qprod -#PBS -N $USER-Fluent-Project -#PBS -A XX-YY-ZZ - -#! Mail to user when job terminate or abort -#PBS -m ae - -#!change the working directory (default is home directory) -#cd <working directory> (working directory must exists) -WORK_DIR="/scratch/$USER/work" -cd $WORK_DIR - -echo Running on host `hostname` -echo Time is `date` -echo Directory is `pwd` -echo This jobs runs on the following processors: -echo `cat $PBS_NODEFILE` - -#### Load ansys module so that we find the cfx5solve command -module load ansys - -# Use following line to specify MPI for message-passing instead -NCORES=`wc -l $PBS_NODEFILE |awk '{print $1}'` - -/ansys_inc/v145/fluent/bin/fluent 3d -t$NCORES -cnf=$PBS_NODEFILE -g -i fluent.jou -``` - -Header of the pbs file (above) is common and description can be find on [this site](../../resources-allocation-policy/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. - -Working directory has to be created before sending pbs job into the queue. Input file should be in working directory or full path to input file has to be specified. Input file has to be defined by common Fluent journal file which is attached to the Fluent solver via parameter -i fluent.jou - -Journal file with definition of the input geometry and boundary conditions and defined process of solution has e.g. the following structure: - -```console - /file/read-case aircraft_2m.cas.gz - /solve/init - init - /solve/iterate - 10 - /file/write-case-dat aircraft_2m-solution - /exit yes -``` - -The appropriate dimension of the problem has to be set by parameter (2d/3d). - -1. 
Fast way to run Fluent from command line - -```console -fluent solver_version [FLUENT_options] -i journal_file -pbs -``` - -This syntax will start the ANSYS FLUENT job under PBS Professional using the qsub command in a batch manner. When resources are available, PBS Professional will start the job and return a job ID, usually in the form of _job_ID.hostname_. This job ID can then be used to query, control, or stop the job using standard PBS Professional commands, such as qstat or qdel. The job will be run out of the current working directory, and all output will be written to the file fluent.o _job_ID_. - -1. Running Fluent via user's config file - -The sample script uses a configuration file called pbs_fluent.conf if no command line arguments are present. This configuration file should be present in the directory from which the jobs are submitted (which is also the directory in which the jobs are executed). The following is an example of what the content of pbs_fluent.conf can be: - -```console -input="example_small.flin" -case="Small-1.65m.cas" -fluent_args="3d -pmyrinet" -outfile="fluent_test.out" -mpp="true" -``` - -The following is an explanation of the parameters: - -input is the name of the input file. - -case is the name of the .cas file that the input file will utilize. - -fluent_args are extra ANSYS FLUENT arguments. As shown in the previous example, you can specify the interconnect by using the -p interconnect command. The available interconnects include ethernet (the default), myrinet, infiniband, vendor, altix, and crayx. The MPI is selected automatically, based on the specified interconnect. - -outfile is the name of the file to which the standard output will be sent. - -mpp="true" will tell the job script to execute the job across multiple processors. - -To run ANSYS Fluent in batch mode with user's config file you can utilize/modify the following script and execute it via the qsub command. - -```bash -#!/bin/sh -#PBS -l nodes=2:ppn=4 -#PBS -1 qprod -#PBS -N $USE-Fluent-Project -#PBS -A XX-YY-ZZ - - cd $PBS_O_WORKDIR - - #We assume that if they didn’t specify arguments then they should use the - #config file if ["xx${input}${case}${mpp}${fluent_args}zz" = "xxzz" ]; then - if [ -f pbs_fluent.conf ]; then - . pbs_fluent.conf - else - printf "No command line arguments specified, " - printf "and no configuration file found. Exiting n" - fi - fi - - - #Augment the ANSYS FLUENT command line arguments case "$mpp" in - true) - #MPI job execution scenario - num_nodes=â€cat $PBS_NODEFILE | sort -u | wc -l†- cpus=â€expr $num_nodes * $NCPUS†- #Default arguments for mpp jobs, these should be changed to suit your - #needs. - fluent_args="-t${cpus} $fluent_args -cnf=$PBS_NODEFILE" - ;; - *) - #SMP case - #Default arguments for smp jobs, should be adjusted to suit your - #needs. - fluent_args="-t$NCPUS $fluent_args" - ;; - esac - #Default arguments for all jobs - fluent_args="-ssh -g -i $input $fluent_args" - - echo "---------- Going to start a fluent job with the following settings: - Input: $input - Case: $case - Output: $outfile - Fluent arguments: $fluent_args" - - #run the solver - /ansys_inc/v145/fluent/bin/fluent $fluent_args > $outfile -``` - -It runs the jobs out of the directory from which they are submitted (PBS_O_WORKDIR). - -1. Running Fluent in parralel - -Fluent could be run in parallel only under Academic Research license. To do so this ANSYS Academic Research license must be placed before ANSYS CFD license in user preferences. 
To make this change anslic_admin utility should be run - -```console -/ansys_inc/shared_les/licensing/lic_admin/anslic_admin -``` - -ANSLIC_ADMIN Utility will be run - - - - - - - -ANSYS Academic Research license should be moved up to the top of the list. - - diff --git a/docs.it4i/salomon/software/ansys/ansys-ls-dyna.md b/docs.it4i/salomon/software/ansys/ansys-ls-dyna.md deleted file mode 100644 index 8646c26665ea9f10d6d70405e961f1e2efe7fbb9..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/ansys/ansys-ls-dyna.md +++ /dev/null @@ -1,55 +0,0 @@ -# ANSYS LS-DYNA - -**[ANSYSLS-DYNA](http://www.ansys.com/products/structures/ansys-ls-dyna)** software provides convenient and easy-to-use access to the technology-rich, time-tested explicit solver without the need to contend with the complex input requirements of this sophisticated program. Introduced in 1996, ANSYS LS-DYNA capabilities have helped customers in numerous industries to resolve highly intricate design issues. ANSYS Mechanical users have been able take advantage of complex explicit solutions for a long time utilizing the traditional ANSYS Parametric Design Language (APDL) environment. These explicit capabilities are available to ANSYS Workbench users as well. The Workbench platform is a powerful, comprehensive, easy-to-use environment for engineering simulation. CAD import from all sources, geometry cleanup, automatic meshing, solution, parametric optimization, result visualization and comprehensive report generation are all available within a single fully interactive modern graphical user environment. - -To run ANSYS LS-DYNA in batch mode you can utilize/modify the default ansysdyna.pbs script and execute it via the qsub command. - -```bash -#!/bin/bash -#PBS -l nodes=2:ppn=16 -#PBS -q qprod -#PBS -N $USER-DYNA-Project -#PBS -A XX-YY-ZZ - -#! Mail to user when job terminate or abort -#PBS -m ae - -#!change the working directory (default is home directory) -#cd <working directory> -WORK_DIR="/scratch/$USER/work" -cd $WORK_DIR - -echo Running on host `hostname` -echo Time is `date` -echo Directory is `pwd` -echo This jobs runs on the following processors: -echo `cat $PBS_NODEFILE` - -#! Counts the number of processors -NPROCS=`wc -l < $PBS_NODEFILE` - -echo This job has allocated $NPROCS nodes - -module load ansys - -#### Set number of processors per host listing -#### (set to 1 as $PBS_NODEFILE lists each node twice if :ppn=2) -procs_per_host=1 -#### Create host list -hl="" -for host in `cat $PBS_NODEFILE` -do - if ["$hl" = "" ] - then hl="$host:$procs_per_host" - else hl="${hl}:$host:$procs_per_host" - fi -done - -echo Machines: $hl - -/ansys_inc/v145/ansys/bin/ansys145 -dis -lsdynampp i=input.k -machines $hl -``` - -Header of the pbs file (above) is common and description can be find on [this site](../../job-submission-and-execution/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. - -Working directory has to be created before sending pbs job into the queue. Input file should be in working directory or full path to input file has to be specified. 
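As an illustration of the point above, the scratch working directory referenced by the WORK_DIR variable in the sample script can be created (and the input copied into it) before submission; the paths below are only an example and should be adapted to your own project:

```console
# create the scratch working directory used by the jobscript
$ mkdir -p /scratch/$USER/work
# place the LS-DYNA input file there
$ cp input.k /scratch/$USER/work/
```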
Input file has to be defined by common LS-DYNA .**k** file which is attached to the ansys solver via parameter i= diff --git a/docs.it4i/salomon/software/ansys/ansys-mechanical-apdl.md b/docs.it4i/salomon/software/ansys/ansys-mechanical-apdl.md deleted file mode 100644 index c1562c1c23ca09fe308536c45f1c903ab8384b3e..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/ansys/ansys-mechanical-apdl.md +++ /dev/null @@ -1,56 +0,0 @@ -# ANSYS MAPDL - -**[ANSYS Multiphysics](http://www.ansys.com/products/multiphysics)** -software offers a comprehensive product solution for both multiphysics and single-physics analysis. The product includes structural, thermal, fluid and both high- and low-frequency electromagnetic analysis. The product also contains solutions for both direct and sequentially coupled physics problems including direct coupled-field elements and the ANSYS multi-field solver. - -To run ANSYS MAPDL in batch mode you can utilize/modify the default mapdl.pbs script and execute it via the qsub command. - -```bash -#!/bin/bash -#PBS -l nodes=2:ppn=16 -#PBS -q qprod -#PBS -N $USER-ANSYS-Project -#PBS -A XX-YY-ZZ - -#! Mail to user when job terminate or abort -#PBS -m ae - -#!change the working directory (default is home directory) -#cd <working directory> (working directory must exists) -WORK_DIR="/scratch/$USER/work" -cd $WORK_DIR - -echo Running on host `hostname` -echo Time is `date` -echo Directory is `pwd` -echo This jobs runs on the following processors: -echo `cat $PBS_NODEFILE` - -module load ansys - -#### Set number of processors per host listing -#### (set to 1 as $PBS_NODEFILE lists each node twice if :ppn=2) -procs_per_host=1 -#### Create host list -hl="" -for host in `cat $PBS_NODEFILE` -do - if ["$hl" = "" ] - then hl="$host:$procs_per_host" - else hl="${hl}:$host:$procs_per_host" - fi -done - -echo Machines: $hl - -#-i input.dat includes the input of analysis in APDL format -#-o file.out is output file from ansys where all text outputs will be redirected -#-p the name of license feature (aa_r=ANSYS Academic Research, ane3fl=Multiphysics(commercial), aa_r_dy=Academic AUTODYN) -/ansys_inc/v145/ansys/bin/ansys145 -b -dis -p aa_r -i input.dat -o file.out -machines $hl -dir $WORK_DIR -``` - -Header of the pbs file (above) is common and description can be find on [this site](../../resources-allocation-policy/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. - -Working directory has to be created before sending pbs job into the queue. Input file should be in working directory or full path to input file has to be specified. Input file has to be defined by common APDL file which is attached to the ansys solver via parameter -i - -**License** should be selected by parameter -p. 
Licensed products are the following: aa_r (ANSYS **Academic** Research), ane3fl (ANSYS Multiphysics)-**Commercial**, aa_r_dy (ANSYS **Academic** AUTODYN) [More about licensing here](licensing/) diff --git a/docs.it4i/salomon/software/ansys/ansys.md b/docs.it4i/salomon/software/ansys/ansys.md deleted file mode 100644 index d7e0f2e1444ddc77dd861a4cce4eef06b4c78a6c..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/ansys/ansys.md +++ /dev/null @@ -1,15 +0,0 @@ -# Overview of ANSYS Products - -**[SVS FEM](http://www.svsfem.cz/)** as **[ANSYS Channel partner](http://www.ansys.com/)** for Czech Republic provided all ANSYS licenses for ANSELM cluster and supports of all ANSYS Products (Multiphysics, Mechanical, MAPDL, CFX, Fluent, Maxwell, LS-DYNA...) to IT staff and ANSYS users. If you are challenging to problem of ANSYS functionality contact please [hotline@svsfem.cz](mailto:hotline@svsfem.cz?subject=Ostrava%20-%20ANSELM) - -Anselm provides as commercial as academic variants. Academic variants are distinguished by "**Academic...**" word in the name of license or by two letter preposition "**aa\_**" in the license feature name. Change of license is realized on command line respectively directly in user's pbs file (see individual products). [More about licensing here](licensing/) - -To load the latest version of any ANSYS product (Mechanical, Fluent, CFX, MAPDL,...) load the module: - -```console -$ ml ansys -``` - -ANSYS supports interactive regime, but due to assumed solution of extremely difficult tasks it is not recommended. - -If user needs to work in interactive regime we recommend to configure the RSM service on the client machine which allows to forward the solution to the Anselm directly from the client's Workbench project (see ANSYS RSM service). diff --git a/docs.it4i/salomon/software/chemistry/nwchem.md b/docs.it4i/salomon/software/chemistry/nwchem.md deleted file mode 100644 index add429da99d2044e2ddaa64d29350e766c558bc2..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/chemistry/nwchem.md +++ /dev/null @@ -1,43 +0,0 @@ -# NWChem - -## Introduction - -NWChem aims to provide its users with computational chemistry tools that are scalable both in their ability to treat large scientific computational chemistry problems efficiently, and in their use of available parallel computing resources from high-performance parallel supercomputers to conventional workstation clusters. - -[Homepage](http://www.nwchem-sw.org/index.php/Main_Page) - -## Installed Versions - -The following versions are currently installed: - -* NWChem/6.3.revision2-2013-10-17-Python-2.7.8, current release. Compiled with Intel compilers, MKL and Intel MPI -* NWChem/6.5.revision26243-intel-2015b-2014-09-10-Python-2.7.8 - -For a current list of installed versions, execute: - -```console -$ ml av NWChem -``` - -The recommend to use version 6.5. Version 6.3 fails on Salomon nodes with accelerator, because it attempts to communicate over scif0 interface. In 6.5 this is avoided by setting ARMCI_OPENIB_DEVICE=mlx4_0, this setting is included in the module. - -## Running - - NWChem is compiled for parallel MPI execution. Normal procedure for MPI jobs applies. 
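Before preparing a job, the recommended module can be loaded and verified interactively on a login node; this is only a sketch and the exact module string should be taken from the `ml av NWChem` listing above:

```console
# load the recommended 6.5 build and list the loaded modules
$ ml NWChem/6.5.revision26243-intel-2015b-2014-09-10-Python-2.7.8
$ ml
```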
Sample jobscript : - -```bash - #PBS -A IT4I-0-0 - #PBS -q qprod - #PBS -l select=1:ncpus=24:mpiprocs=24 - - cd $PBS_O_WORKDIR - module add NWChem/6.5.revision26243-intel-2015b-2014-09-10-Python-2.7.8 - mpirun nwchem h2o.nw -``` - -## Options - -Please refer to [the documentation](http://www.nwchem-sw.org/index.php/Release62:Top-level) and in the input file set the following directives : - -* MEMORY : controls the amount of memory NWChem will use -* SCRATCH_DIR : set this to a directory in [SCRATCH filesystem](../../storage/storage/) (or run the calculation completely in a scratch directory). For certain calculations, it might be advisable to reduce I/O by forcing "direct" mode, eg. "scf direct" diff --git a/docs.it4i/salomon/software/debuggers/valgrind.md b/docs.it4i/salomon/software/debuggers/valgrind.md deleted file mode 100644 index 188f98502862effe90495934c6288aa64b042318..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/debuggers/valgrind.md +++ /dev/null @@ -1,265 +0,0 @@ -# Valgrind - -## About Valgrind - -Valgrind is an open-source tool, used mainly for debuggig memory-related problems, such as memory leaks, use of uninitalized memory etc. in C/C++ applications. The toolchain was however extended over time with more functionality, such as debugging of threaded applications, cache profiling, not limited only to C/C++. - -Valgind is an extremely useful tool for debugging memory errors such as [off-by-one](http://en.wikipedia.org/wiki/Off-by-one_error). Valgrind uses a virtual machine and dynamic recompilation of binary code, because of that, you can expect that programs being debugged by Valgrind run 5-100 times slower. - -The main tools available in Valgrind are : - -* **Memcheck**, the original, must used and default tool. Verifies memory access in you program and can detect use of unitialized memory, out of bounds memory access, memory leaks, double free, etc. -* **Massif**, a heap profiler. -* **Hellgrind** and **DRD** can detect race conditions in multi-threaded applications. -* **Cachegrind**, a cache profiler. -* **Callgrind**, a callgraph analyzer. -* For a full list and detailed documentation, please refer to the [official Valgrind documentation](http://valgrind.org/docs/). - -## Installed Versions - -There are two versions of Valgrind available on the cluster. - -* Version 3.8.1, installed by operating system vendor in /usr/bin/valgrind. This version is available by default, without the need to load any module. This version however does not provide additional MPI support. Also, it does not support AVX2 instructions, debugging of an AVX2-enabled executable with this version will fail -* Version 3.11.0 built by ICC with support for Intel MPI, available in module Valgrind/3.11.0-intel-2015b. After loading the module, this version replaces the default valgrind. -* Version 3.11.0 built by GCC with support for Open MPI, module Valgrind/3.11.0-foss-2015b - -## Usage - -Compile the application which you want to debug as usual. It is advisable to add compilation flags -g (to add debugging information to the binary so that you will see original source code lines in the output) and -O0 (to disable compiler optimizations). 
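With GCC, for instance, such a debug build might look as follows; myprogram.c is a placeholder source name, and the Intel compiler variant is shown below:

```console
# -g keeps source-level debug info, -O0 disables compiler optimizations
$ gcc -g -O0 myprogram.c -o myprogram
```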
- -For example, lets look at this C code, which has two problems: - -```cpp - #include <stdlib.h> - - void f(void) - { - int* x = malloc(10 * sizeof(int)); - x[10] = 0; // problem 1: heap block overrun - } // problem 2: memory leak -- x not freed - - int main(void) - { - f(); - return 0; - } -``` - -Now, compile it with Intel compiler: - -```console -$ module add intel -$ icc -g valgrind-example.c -o valgrind-example -``` - -Now, lets run it with Valgrind. The syntax is: - -valgrind [valgrind options] < your program binary > [your program options] - -If no Valgrind options are specified, Valgrind defaults to running Memcheck tool. Please refer to the Valgrind documentation for a full description of command line options. - -```console -$ valgrind ./valgrind-example - ==12652== Memcheck, a memory error detector - ==12652== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al. - ==12652== Using Valgrind-3.9.0 and LibVEX; rerun with -h for copyright info - ==12652== Command: ./valgrind-example - ==12652== - ==12652== Invalid write of size 4 - ==12652== at 0x40053E: f (valgrind-example.c:6) - ==12652== by 0x40054E: main (valgrind-example.c:11) - ==12652== Address 0x5861068 is 0 bytes after a block of size 40 alloc'd - ==12652== at 0x4C27AAA: malloc (vg_replace_malloc.c:291) - ==12652== by 0x400528: f (valgrind-example.c:5) - ==12652== by 0x40054E: main (valgrind-example.c:11) - ==12652== - ==12652== - ==12652== HEAP SUMMARY: - ==12652== in use at exit: 40 bytes in 1 blocks - ==12652== total heap usage: 1 allocs, 0 frees, 40 bytes allocated - ==12652== - ==12652== LEAK SUMMARY: - ==12652== definitely lost: 40 bytes in 1 blocks - ==12652== indirectly lost: 0 bytes in 0 blocks - ==12652== possibly lost: 0 bytes in 0 blocks - ==12652== still reachable: 0 bytes in 0 blocks - ==12652== suppressed: 0 bytes in 0 blocks - ==12652== Rerun with --leak-check=full to see details of leaked memory - ==12652== - ==12652== For counts of detected and suppressed errors, rerun with: -v - ==12652== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 6 from 6) -``` - -In the output we can see that Valgrind has detected both errors - the off-by-one memory access at line 5 and a memory leak of 40 bytes. If we want a detailed analysis of the memory leak, we need to run Valgrind with --leak-check=full option: - -```console -$ valgrind --leak-check=full ./valgrind-example - ==23856== Memcheck, a memory error detector - ==23856== Copyright (C) 2002-2010, and GNU GPL'd, by Julian Seward et al. 
- ==23856== Using Valgrind-3.6.0 and LibVEX; rerun with -h for copyright info - ==23856== Command: ./valgrind-example - ==23856== - ==23856== Invalid write of size 4 - ==23856== at 0x40067E: f (valgrind-example.c:6) - ==23856== by 0x40068E: main (valgrind-example.c:11) - ==23856== Address 0x66e7068 is 0 bytes after a block of size 40 alloc'd - ==23856== at 0x4C26FDE: malloc (vg_replace_malloc.c:236) - ==23856== by 0x400668: f (valgrind-example.c:5) - ==23856== by 0x40068E: main (valgrind-example.c:11) - ==23856== - ==23856== - ==23856== HEAP SUMMARY: - ==23856== in use at exit: 40 bytes in 1 blocks - ==23856== total heap usage: 1 allocs, 0 frees, 40 bytes allocated - ==23856== - ==23856== 40 bytes in 1 blocks are definitely lost in loss record 1 of 1 - ==23856== at 0x4C26FDE: malloc (vg_replace_malloc.c:236) - ==23856== by 0x400668: f (valgrind-example.c:5) - ==23856== by 0x40068E: main (valgrind-example.c:11) - ==23856== - ==23856== LEAK SUMMARY: - ==23856== definitely lost: 40 bytes in 1 blocks - ==23856== indirectly lost: 0 bytes in 0 blocks - ==23856== possibly lost: 0 bytes in 0 blocks - ==23856== still reachable: 0 bytes in 0 blocks - ==23856== suppressed: 0 bytes in 0 blocks - ==23856== - ==23856== For counts of detected and suppressed errors, rerun with: -v - ==23856== ERROR SUMMARY: 2 errors from 2 contexts (suppressed: 6 from 6) -``` - -Now we can see that the memory leak is due to the malloc() at line 6. - -## Usage With MPI - -Although Valgrind is not primarily a parallel debugger, it can be used to debug parallel applications as well. When launching your parallel applications, prepend the valgrind command. For example: - -```console -$ mpirun -np 4 valgrind myapplication -``` - -The default version without MPI support will however report a large number of false errors in the MPI library, such as: - -```console - ==30166== Conditional jump or move depends on uninitialised value(s) - ==30166== at 0x4C287E8: strlen (mc_replace_strmem.c:282) - ==30166== by 0x55443BD: I_MPI_Processor_model_number (init_interface.c:427) - ==30166== by 0x55439E0: I_MPI_Processor_arch_code (init_interface.c:171) - ==30166== by 0x558D5AE: MPID_nem_impi_init_shm_configuration (mpid_nem_impi_extensions.c:1091) - ==30166== by 0x5598F4C: MPID_nem_init_ckpt (mpid_nem_init.c:566) - ==30166== by 0x5598B65: MPID_nem_init (mpid_nem_init.c:489) - ==30166== by 0x539BD75: MPIDI_CH3_Init (ch3_init.c:64) - ==30166== by 0x5578743: MPID_Init (mpid_init.c:193) - ==30166== by 0x554650A: MPIR_Init_thread (initthread.c:539) - ==30166== by 0x553369F: PMPI_Init (init.c:195) - ==30166== by 0x4008BD: main (valgrind-example-mpi.c:18) -``` - -so it is better to use the MPI-enabled valgrind from module. The MPI versions requires library: - -$EBROOTVALGRIND/lib/valgrind/libmpiwrap-amd64-linux.so - -which must be included in the LD_PRELOAD environment variable. - -Lets look at this MPI example: - -```cpp - #include <stdlib.h> - #include <mpi.h> - - int main(int argc, char *argv[]) - { - int *data = malloc(sizeof(int)*99); - - MPI_Init(&argc, &argv); - MPI_Bcast(data, 100, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Finalize(); - - return 0; - } -``` - -There are two errors - use of uninitialized memory and invalid length of the buffer. 
Lets debug it with valgrind : - -```console -$ module add intel impi -$ mpiicc -g valgrind-example-mpi.c -o valgrind-example-mpi -$ module add Valgrind/3.11.0-intel-2015b -$ mpirun -np 2 -env LD_PRELOAD $EBROOTVALGRIND/lib/valgrind/libmpiwrap-amd64-linux.so valgrind ./valgrind-example-mpi -``` - -Prints this output : (note that there is output printed for every launched MPI process) - -```console - ==31318== Memcheck, a memory error detector - ==31318== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al. - ==31318== Using Valgrind-3.9.0 and LibVEX; rerun with -h for copyright info - ==31318== Command: ./valgrind-example-mpi - ==31318== - ==31319== Memcheck, a memory error detector - ==31319== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al. - ==31319== Using Valgrind-3.9.0 and LibVEX; rerun with -h for copyright info - ==31319== Command: ./valgrind-example-mpi - ==31319== - valgrind MPI wrappers 31319: Active for pid 31319 - valgrind MPI wrappers 31319: Try MPIWRAP_DEBUG=help for possible options - valgrind MPI wrappers 31318: Active for pid 31318 - valgrind MPI wrappers 31318: Try MPIWRAP_DEBUG=help for possible options - ==31319== Unaddressable byte(s) found during client check request - ==31319== at 0x4E35974: check_mem_is_addressable_untyped (libmpiwrap.c:960) - ==31319== by 0x4E5D0FE: PMPI_Bcast (libmpiwrap.c:908) - ==31319== by 0x400911: main (valgrind-example-mpi.c:20) - ==31319== Address 0x69291cc is 0 bytes after a block of size 396 alloc'd - ==31319== at 0x4C27AAA: malloc (vg_replace_malloc.c:291) - ==31319== by 0x4007BC: main (valgrind-example-mpi.c:8) - ==31319== - ==31318== Uninitialised byte(s) found during client check request - ==31318== at 0x4E3591D: check_mem_is_defined_untyped (libmpiwrap.c:952) - ==31318== by 0x4E5D06D: PMPI_Bcast (libmpiwrap.c:908) - ==31318== by 0x400911: main (valgrind-example-mpi.c:20) - ==31318== Address 0x6929040 is 0 bytes inside a block of size 396 alloc'd - ==31318== at 0x4C27AAA: malloc (vg_replace_malloc.c:291) - ==31318== by 0x4007BC: main (valgrind-example-mpi.c:8) - ==31318== - ==31318== Unaddressable byte(s) found during client check request - ==31318== at 0x4E3591D: check_mem_is_defined_untyped (libmpiwrap.c:952) - ==31318== by 0x4E5D06D: PMPI_Bcast (libmpiwrap.c:908) - ==31318== by 0x400911: main (valgrind-example-mpi.c:20) - ==31318== Address 0x69291cc is 0 bytes after a block of size 396 alloc'd - ==31318== at 0x4C27AAA: malloc (vg_replace_malloc.c:291) - ==31318== by 0x4007BC: main (valgrind-example-mpi.c:8) - ==31318== - ==31318== - ==31318== HEAP SUMMARY: - ==31318== in use at exit: 3,172 bytes in 67 blocks - ==31318== total heap usage: 191 allocs, 124 frees, 81,203 bytes allocated - ==31318== - ==31319== - ==31319== HEAP SUMMARY: - ==31319== in use at exit: 3,172 bytes in 67 blocks - ==31319== total heap usage: 175 allocs, 108 frees, 48,435 bytes allocated - ==31319== - ==31318== LEAK SUMMARY: - ==31318== definitely lost: 408 bytes in 3 blocks - ==31318== indirectly lost: 256 bytes in 1 blocks - ==31318== possibly lost: 0 bytes in 0 blocks - ==31318== still reachable: 2,508 bytes in 63 blocks - ==31318== suppressed: 0 bytes in 0 blocks - ==31318== Rerun with --leak-check=full to see details of leaked memory - ==31318== - ==31318== For counts of detected and suppressed errors, rerun with: -v - ==31318== Use --track-origins=yes to see where uninitialised values come from - ==31318== ERROR SUMMARY: 2 errors from 2 contexts (suppressed: 4 from 4) - ==31319== LEAK SUMMARY: - ==31319== definitely lost: 408 
bytes in 3 blocks - ==31319== indirectly lost: 256 bytes in 1 blocks - ==31319== possibly lost: 0 bytes in 0 blocks - ==31319== still reachable: 2,508 bytes in 63 blocks - ==31319== suppressed: 0 bytes in 0 blocks - ==31319== Rerun with --leak-check=full to see details of leaked memory - ==31319== - ==31319== For counts of detected and suppressed errors, rerun with: -v - ==31319== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 4 from 4) -``` - -We can see that Valgrind has reported use of unitialised memory on the master process (which reads the array to be broadcasted) and use of unaddresable memory on both processes. diff --git a/docs.it4i/salomon/software/machine-learning/introduction.md b/docs.it4i/salomon/software/machine-learning/introduction.md deleted file mode 100644 index 4c5aba660e41441d4779fcf53cffa09c6b35b1d4..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/machine-learning/introduction.md +++ /dev/null @@ -1,19 +0,0 @@ -# Machine Learning - -This section overviews machine learning frameworks and libraries available on the Salomon cluster. - -## TensorFlow - -Load TensorFlow module: - -```console -$ ml Tensorflow -``` - -Test module: - -```console -$ ml Tensorflow -``` - -Read more about available versions at the [TensorFlow page](tensorflow). diff --git a/docs.it4i/salomon/software/machine-learning/tensorflow.md b/docs.it4i/salomon/software/machine-learning/tensorflow.md deleted file mode 100644 index 271cde01e96c589371482cec853d127c3f170089..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/machine-learning/tensorflow.md +++ /dev/null @@ -1,60 +0,0 @@ -# TensorFlow - -TensorFlow is an open-source software library for machine intelligence. - -## TensorFlow modules - -Salomon provides three different TensorFlow modules: - * Tensorflow/1.1.0 - * Tensorflow/1.2.0-GCC-7.1.0-2.28 - * Tensorflow/1.2.0-intel-2017.05-mkl - -### Tensorflow/1.1.0 (not recommended) - -TensorFlow 1.1 build. - -```console -$ ml Tensorflow/1.1.0 -``` - -This module was built with: - * GCC/4.9.3 - * Python/3.6.1 - -### Tensorflow/1.2.0-GCC-7.1.0-2.28 (default, recommended) - -TensorFlow 1.2 with SIMD support. TensorFlow build taking advantage of the Salomon CPU architecture. - -```console -$ ml Tensorflow/1.2.0-GCC-7.1.0-2.28 -``` - -This module was built with: - * GCC/7.1.0-2.28 - * Python/3.6.1 - * protobuf/3.2.0-GCC-7.1.0-2.28-Python-3.6.1 - -### Tensorflow/1.2.0-intel-2017.05-mkl - -TensorFlow 1.2 with MKL support. - -```console -$ ml Tensorflow/1.2.0-intel-2017.05-mkl -``` - -This module was built with: - * icc/2017.4.196-GCC-7.1.0-2.28 - * Python/3.6.1 - * protobuf/3.2.0-GCC-7.1.0-2.28-Python-3.6.1 - -## TensorFlow application example - -After loading one of the available TensorFlow modules, you can check the functionality running the following python script. - -```python -import tensorflow as tf - -c = tf.constant('Hello World!') -sess = tf.Session() -print(sess.run(c)) -``` diff --git a/docs.it4i/salomon/software/mpi/Running_OpenMPI.md b/docs.it4i/salomon/software/mpi/Running_OpenMPI.md deleted file mode 100644 index e2633236ac6624c7a41ed56496bacb9795158901..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/mpi/Running_OpenMPI.md +++ /dev/null @@ -1,203 +0,0 @@ -# Running OpenMPI - -## OpenMPI Program Execution - -The OpenMPI programs may be executed only via the PBS Workload manager, by entering an appropriate queue. On the cluster, the **OpenMPI 1.8.6** is OpenMPI based MPI implementation. 
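The examples in this section execute a binary called helloworld_mpi.x. A minimal sketch of how such a binary might be built with the OpenMPI compiler wrapper, assuming a standard MPI hello-world source file named helloworld_mpi.c, is:

```console
# load the OpenMPI module and compile with the MPI wrapper
$ ml OpenMPI
$ mpicc helloworld_mpi.c -o helloworld_mpi.x
```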
- -### Basic Usage - -Use the mpiexec to run the OpenMPI code. - -Example: - -```console -$ qsub -q qexp -l select=4:ncpus=24 -I - qsub: waiting for job 15210.isrv5 to start - qsub: job 15210.isrv5 ready -$ pwd - /home/username -$ ml OpenMPI -$ mpiexec -pernode ./helloworld_mpi.x - Hello world! from rank 0 of 4 on host r1i0n17 - Hello world! from rank 1 of 4 on host r1i0n5 - Hello world! from rank 2 of 4 on host r1i0n6 - Hello world! from rank 3 of 4 on host r1i0n7 -``` - -Please be aware, that in this example, the directive **-pernode** is used to run only **one task per node**, which is normally an unwanted behaviour (unless you want to run hybrid code with just one MPI and 24 OpenMP tasks per node). In normal MPI programs **omit the -pernode directive** to run up to 24 MPI tasks per each node. - -In this example, we allocate 4 nodes via the express queue interactively. We set up the openmpi environment and interactively run the helloworld_mpi.x program. -Note that the executable helloworld_mpi.x must be available within the same path on all nodes. This is automatically fulfilled on the /home and /scratch filesystem. - -You need to preload the executable, if running on the local ramdisk /tmp filesystem - -```console -$ pwd - /tmp/pbs.15210.isrv5 -$ mpiexec -pernode --preload-binary ./helloworld_mpi.x - Hello world! from rank 0 of 4 on host r1i0n17 - Hello world! from rank 1 of 4 on host r1i0n5 - Hello world! from rank 2 of 4 on host r1i0n6 - Hello world! from rank 3 of 4 on host r1i0n7 -``` - -In this example, we assume the executable helloworld_mpi.x is present on compute node r1i0n17 on ramdisk. We call the mpiexec whith the **--preload-binary** argument (valid for openmpi). The mpiexec will copy the executable from r1i0n17 to the /tmp/pbs.15210.isrv5 directory on r1i0n5, r1i0n6 and r1i0n7 and execute the program. - -MPI process mapping may be controlled by PBS parameters. - -The mpiprocs and ompthreads parameters allow for selection of number of running MPI processes per node as well as number of OpenMP threads per MPI process. - -### One MPI Process Per Node - -Follow this example to run one MPI process per node, 24 threads per process. - -```console -$ qsub -q qexp -l select=4:ncpus=24:mpiprocs=1:ompthreads=24 -I -$ ml OpenMPI -$ mpiexec --bind-to-none ./helloworld_mpi.x -``` - -In this example, we demonstrate recommended way to run an MPI application, using 1 MPI processes per node and 24 threads per socket, on 4 nodes. - -### Two MPI Processes Per Node - -Follow this example to run two MPI processes per node, 8 threads per process. Note the options to mpiexec. - -```console -$ qsub -q qexp -l select=4:ncpus=24:mpiprocs=2:ompthreads=12 -I -$ ml OpenMPI -$ mpiexec -bysocket -bind-to-socket ./helloworld_mpi.x -``` - -In this example, we demonstrate recommended way to run an MPI application, using 2 MPI processes per node and 12 threads per socket, each process and its threads bound to a separate processor socket of the node, on 4 nodes - -### 24 MPI Processes Per Node - -Follow this example to run 24 MPI processes per node, 1 thread per process. Note the options to mpiexec. - -```console -$ qsub -q qexp -l select=4:ncpus=24:mpiprocs=24:ompthreads=1 -I -$ ml OpenMPI -$ mpiexec -bycore -bind-to-core ./helloworld_mpi.x -``` - -In this example, we demonstrate recommended way to run an MPI application, using 24 MPI processes per node, single threaded. Each process is bound to separate processor core, on 4 nodes. - -### OpenMP Thread Affinity - -!!! note - Important! 
Bind every OpenMP thread to a core! - -In the previous two examples with one or two MPI processes per node, the operating system might still migrate OpenMP threads between cores. You might want to avoid this by setting these environment variable for GCC OpenMP: - -```console -$ export GOMP_CPU_AFFINITY="0-23" -``` - -or this one for Intel OpenMP: - -```console -$ export KMP_AFFINITY=granularity=fine,compact,1,0 -``` - -As of OpenMP 4.0 (supported by GCC 4.9 and later and Intel 14.0 and later) the following variables may be used for Intel or GCC: - -```console -$ export OMP_PROC_BIND=true -$ export OMP_PLACES=cores -``` - -## OpenMPI Process Mapping and Binding - -The mpiexec allows for precise selection of how the MPI processes will be mapped to the computational nodes and how these processes will bind to particular processor sockets and cores. - -MPI process mapping may be specified by a hostfile or rankfile input to the mpiexec program. Altough all implementations of MPI provide means for process mapping and binding, following examples are valid for the openmpi only. - -### Hostfile - -Example hostfile - -```console - r1i0n17.smc.salomon.it4i.cz - r1i0n5.smc.salomon.it4i.cz - r1i0n6.smc.salomon.it4i.cz - r1i0n7.smc.salomon.it4i.cz -``` - -Use the hostfile to control process placement - -```console -$ mpiexec -hostfile hostfile ./helloworld_mpi.x - Hello world! from rank 0 of 4 on host r1i0n17 - Hello world! from rank 1 of 4 on host r1i0n5 - Hello world! from rank 2 of 4 on host r1i0n6 - Hello world! from rank 3 of 4 on host r1i0n7 -``` - -In this example, we see that ranks have been mapped on nodes according to the order in which nodes show in the hostfile - -### Rankfile - -Exact control of MPI process placement and resource binding is provided by specifying a rankfile - -Appropriate binding may boost performance of your application. - -Example rankfile - -```console - rank 0=r1i0n7.smc.salomon.it4i.cz slot=1:0,1 - rank 1=r1i0n6.smc.salomon.it4i.cz slot=0:* - rank 2=r1i0n5.smc.salomon.it4i.cz slot=1:1-2 - rank 3=r1i0n17.smc.salomon slot=0:1,1:0-2 - rank 4=r1i0n6.smc.salomon.it4i.cz slot=0:*,1:* -``` - -This rankfile assumes 5 ranks will be running on 4 nodes and provides exact mapping and binding of the processes to the processor sockets and cores - -Explanation: -rank 0 will be bounded to r1i0n7, socket1 core0 and core1 -rank 1 will be bounded to r1i0n6, socket0, all cores -rank 2 will be bounded to r1i0n5, socket1, core1 and core2 -rank 3 will be bounded to r1i0n17, socket0 core1, socket1 core0, core1, core2 -rank 4 will be bounded to r1i0n6, all cores on both sockets - -```console - $ mpiexec -n 5 -rf rankfile --report-bindings ./helloworld_mpi.x - [r1i0n17:11180] MCW rank 3 bound to socket 0[core 1] socket 1[core 0-2]: [. B . . . . . . . . . .][B B B . . . . . . . . .] (slot list 0:1,1:0-2) - [r1i0n7:09928] MCW rank 0 bound to socket 1[core 0-1]: [. . . . . . . . . . . .][B B . . . . . . . . . .] (slot list 1:0,1) - [r1i0n6:10395] MCW rank 1 bound to socket 0[core 0-7]: [B B B B B B B B B B B B][. . . . . . . . . . . .] (slot list 0:*) - [r1i0n5:10406] MCW rank 2 bound to socket 1[core 1-2]: [. . . . . . . . . . . .][. B B . . . . . . . . .] (slot list 1:1-2) - [r1i0n6:10406] MCW rank 4 bound to socket 0[core 0-7] socket 1[core 0-7]: [B B B B B B B B B B B B][B B B B B B B B B B B B] (slot list 0:*,1:*) - Hello world! from rank 3 of 5 on host r1i0n17 - Hello world! from rank 1 of 5 on host r1i0n6 - Hello world! from rank 0 of 5 on host r1i0n7 - Hello world! 
from rank 4 of 5 on host r1i0n6 - Hello world! from rank 2 of 5 on host r1i0n5 -``` - -In this example we run 5 MPI processes (5 ranks) on four nodes. The rankfile defines how the processes will be mapped on the nodes, sockets and cores. The **--report-bindings** option was used to print out the actual process location and bindings. Note that ranks 1 and 4 run on the same node and their core binding overlaps. - -It is users responsibility to provide correct number of ranks, sockets and cores. - -### Bindings Verification - -In all cases, binding and threading may be verified by executing for example: - -```console -$ mpiexec -bysocket -bind-to-socket --report-bindings echo -$ mpiexec -bysocket -bind-to-socket numactl --show -$ mpiexec -bysocket -bind-to-socket echo $OMP_NUM_THREADS -``` - -## Changes in OpenMPI 1.8 - -Some options have changed in OpenMPI version 1.8. - -| version 1.6.5 | version 1.8.1 | -| ---------------- | ------------------- | -| --bind-to-none | --bind-to none | -| --bind-to-core | --bind-to core | -| --bind-to-socket | --bind-to socket | -| -bysocket | --map-by socket | -| -bycore | --map-by core | -| -pernode | --map-by ppr:1:node | diff --git a/docs.it4i/salomon/software/mpi/mpi4py-mpi-for-python.md b/docs.it4i/salomon/software/mpi/mpi4py-mpi-for-python.md deleted file mode 100644 index f957f5c1439272e3b65d069e68d055e52c4cc0b8..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/mpi/mpi4py-mpi-for-python.md +++ /dev/null @@ -1,109 +0,0 @@ -# MPI4Py (MPI for Python) - -OpenMPI interface to Python - -## Introduction - -MPI for Python provides bindings of the Message Passing Interface (MPI) standard for the Python programming language, allowing any Python program to exploit multiple processors. - -This package is constructed on top of the MPI-1/2 specifications and provides an object oriented interface which closely follows MPI-2 C++ bindings. It supports point-to-point (sends, receives) and collective (broadcasts, scatters, gathers) communications of any picklable Python object, as well as optimized communications of Python object exposing the single-segment buffer interface (NumPy arrays, builtin bytes/string/array objects). - -On Salomon MPI4Py is available in standard Python modules. - -## Modules - -MPI4Py is build for OpenMPI. Before you start with MPI4Py you need to load Python and OpenMPI modules. You can use toolchain, that loads Python and OpenMPI at once. - -```console -$ ml av Python/ ---------------------------------------- /apps/modules/lang ------------------------- - Python/2.7.8-intel-2015b Python/2.7.11-intel-2016a Python/3.5.1-intel-2017.00 - Python/2.7.11-intel-2017a Python/2.7.9-foss-2015b Python/2.7.9-intel-2015b - Python/2.7.11-foss-2016a Python/3.5.2-foss-2016a Python/3.5.1 - Python/2.7.9-foss-2015g Python/3.4.3-intel-2015b Python/2.7.9 - Python/2.7.11-intel-2015b Python/3.5.2 - -$ ml av OpenMPI/ ---------------------------------------- /apps/modules/mpi -------------------------- -OpenMPI/1.8.6-GCC-4.4.7-system OpenMPI/1.8.8-GNU-4.9.3-2.25 OpenMPI/1.10.1-GCC-4.9.3-2.25 -OpenMPI/1.8.6-GNU-5.1.0-2.25 OpenMPI/1.8.8-GNU-5.1.0-2.25 OpenMPI/1.10.1-GNU-4.9.3-2.25 - OpenMPI/1.8.8-iccifort-2015.3.187-GNU-4.9.3-2.25 OpenMPI/2.0.2-GCC-6.3.0-2.27 -``` - -!!! Warning "" - * modules Python/x.x.x-intel... - intel MPI - * modules Python/x.x.x-foss... - OpenMPI - * modules Python/x.x.x - without MPI - -## Execution - -You need to import MPI to your python program. 
Include the following line to the python script: - -```console - from mpi4py import MPI -``` - -The MPI4Py enabled python programs [execute as any other OpenMPI](Running_OpenMPI/) code.The simpliest way is to run - -```console -$ mpiexec python <script>.py -``` - -For example - -```console -$ mpiexec python hello_world.py -``` - -## Examples - -### Hello World! - -```cpp - from mpi4py import MPI - - comm = MPI.COMM_WORLD - - print "Hello! I'm rank %d from %d running in total..." % (comm.rank, comm.size) - - comm.Barrier() # wait for everybody to synchronize -``` - -### Collective Communication With NumPy Arrays - -```cpp - from __future__ import division - from mpi4py import MPI - import numpy as np - - comm = MPI.COMM_WORLD - - print("-"*78) - print(" Running on %d cores" % comm.size) - print("-"*78) - - comm.Barrier() - - # Prepare a vector of N=5 elements to be broadcasted... - N = 5 - if comm.rank == 0: - A = np.arange(N, dtype=np.float64) # rank 0 has proper data - else: - A = np.empty(N, dtype=np.float64) # all other just an empty array - - # Broadcast A from rank 0 to everybody - comm.Bcast( [A, MPI.DOUBLE] ) - - # Everybody should now have the same... - print "[%02d] %s" % (comm.rank, A) -``` - -Execute the above code as: - -```console -$ qsub -q qexp -l select=4:ncpus=24:mpiprocs=24:ompthreads=1 -I -$ ml Python/2.7.9-foss-2015g - $ mpiexec --map-by core --bind-to core python hello_world.py -``` - -In this example, we run MPI4Py enabled code on 4 nodes, 24 cores per node (total of 96 processes), each python process is bound to a different core. More examples and documentation can be found on [MPI for Python webpage](https://pypi.python.org/pypi/mpi4py). diff --git a/docs.it4i/salomon/software/numerical-languages/introduction.md b/docs.it4i/salomon/software/numerical-languages/introduction.md deleted file mode 100644 index 13ba67071a136612568b6772104f0c8c5430ba40..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/numerical-languages/introduction.md +++ /dev/null @@ -1,40 +0,0 @@ -# Numerical languages - -Interpreted languages for numerical computations and analysis - -## Introduction - -This section contains a collection of high-level interpreted languages, primarily intended for numerical computations. - -## Matlab - -MATLAB®^ is a high-level language and interactive environment for numerical computation, visualization, and programming. - -```console -$ ml MATLAB -$ matlab -``` - -Read more at the [Matlab page](matlab/). - -## Octave - -GNU Octave is a high-level interpreted language, primarily intended for numerical computations. The Octave language is quite similar to Matlab so that most programs are easily portable. - -```console -$ ml Octave -$ octave -``` - -Read more at the [Octave page](octave/). - -## R - -The R is an interpreted language and environment for statistical computing and graphics. - -```console -$ ml R -$ R -``` - -Read more at the [R page](r/). diff --git a/docs.it4i/salomon/software/numerical-languages/octave.md b/docs.it4i/salomon/software/numerical-languages/octave.md deleted file mode 100644 index 5c679dd1b87e587965d802f2845997b755254fa2..0000000000000000000000000000000000000000 --- a/docs.it4i/salomon/software/numerical-languages/octave.md +++ /dev/null @@ -1,56 +0,0 @@ -# Octave - -GNU Octave is a high-level interpreted language, primarily intended for numerical computations. It provides capabilities for the numerical solution of linear and nonlinear problems, and for performing other numerical experiments. 
It also provides extensive graphics capabilities for data visualization and manipulation. Octave is normally used through its interactive command line interface, but it can also be used to write non-interactive programs. The Octave language is quite similar to Matlab so that most programs are easily portable. Read more on <http://www.gnu.org/software/octave/> - -Two versions of octave are available on the cluster, via module - -| Status | Version | module | -| ---------- | ------------ | ------ | -| **Stable** | Octave 3.8.2 | Octave | - -```console -$ ml Octave -``` - -The octave on the cluster is linked to highly optimized MKL mathematical library. This provides threaded parallelization to many octave kernels, notably the linear algebra subroutines. Octave runs these heavy calculation kernels without any penalty. By default, octave would parallelize to 24 threads. You may control the threads by setting the OMP_NUM_THREADS environment variable. - -To run octave interactively, log in with ssh -X parameter for X11 forwarding. Run octave: - -```console -$ octave -``` - -To run octave in batch mode, write an octave script, then write a bash jobscript and execute via the qsub command. By default, octave will use 16 threads when running MKL kernels. - -```bash - #!/bin/bash - - # change to local scratch directory - mkdir -p /scratch/work/user/$USER/$PBS_JOBID - cd /scratch/work/user/$USER/$PBS_JOBID || exit - - # copy input file to scratch - cp $PBS_O_WORKDIR/octcode.m . - - # load octave module - module load Octave - - # execute the calculation - octave -q --eval octcode > output.out - - # copy output file to home - cp output.out $PBS_O_WORKDIR/. - - #exit - exit -``` - -This script may be submitted directly to the PBS workload manager via the qsub command. The inputs are in octcode.m file, outputs in output.out file. See the single node jobscript example in the [Job execution section](../../). - -The octave c compiler mkoctfile calls the GNU gcc 4.8.1 for compiling native c code. This is very useful for running native c subroutines in octave environment. - -```console -$ mkoctfile -v -``` - -Octave may use MPI for interprocess communication This functionality is currently not supported on the cluster cluster. In case you require the octave interface to MPI, please contact our [cluster support](https://support.it4i.cz/rt/). diff --git a/docs.it4i/salomon/storage.md b/docs.it4i/salomon/storage.md index dacdcd184d00762d7e9b1ffd771990e157ca2180..af1c35a79ffd25b3199e83090f1da6cca44545e0 100644 --- a/docs.it4i/salomon/storage.md +++ b/docs.it4i/salomon/storage.md @@ -9,9 +9,9 @@ All login and compute nodes may access same data on shared file systems. Compute ## Policy (In a Nutshell) !!! note - \* Use [HOME](#home) for your most valuable data and programs. - \* Use [WORK](#work) for your large project files. - \* Use [TEMP](#temp) for large scratch data. + \* Use [HOME](#home) for your most valuable data and programs. + \* Use [WORK](#work) for your large project files. + \* Use [TEMP](#temp) for large scratch data. !!! warning Do not use for [archiving](#archiving)! @@ -155,7 +155,7 @@ Entries: 6 ``` In this example, we view current size limits and space occupied on the /home and /scratch filesystem, for a particular user executing the command. -Note that limits are imposed also on number of objects (files, directories, links, etc...) that are allowed to create. +Note that limits are imposed also on number of objects (files, directories, links, etc...) that are allowed to create. 
To have a better understanding of where the space is exactly used, you can use following command to find out. @@ -281,7 +281,7 @@ The TEMP workspace resides on SCRATCH file system. The TEMP workspace accesspoin Users are advised to save the necessary data from the TEMP workspace to HOME or WORK after the calculations and clean up the scratch files. -!!! warning +!!! warning Files on the TEMP file system that are **not accessed for more than 90 days** will be automatically **deleted**. The TEMP workspace is hosted on SCRATCH file system. The SCRATCH is realized as Lustre parallel file system and is available from all login and computational nodes. Default stripe size is 1 MB, stripe count is 1. There are 54 OSTs dedicated for the SCRATCH file system. diff --git a/docs.it4i/snippets/modules_matrix_search.md b/docs.it4i/snippets/modules_matrix_search.md index b936364e15723838e1609be9cfb57df526496e85..4d91e6caf59329ca9d772f50226f78ff32038913 100644 --- a/docs.it4i/snippets/modules_matrix_search.md +++ b/docs.it4i/snippets/modules_matrix_search.md @@ -36,6 +36,6 @@ $("#searchInput").keyup(function () { "color": "black" }); }).css({ - "color": "#C0C0C0" + "color": "#C0C0C0" }); </script> diff --git a/docs.it4i/snippets/resource_accounting.md b/docs.it4i/snippets/resource_accounting.md index c4f7b6a8fd07a0b5f0aacee189552131ff7ffd44..138493c95d0dfd5d6e8205cc176fbe4b6e92480d 100644 --- a/docs.it4i/snippets/resource_accounting.md +++ b/docs.it4i/snippets/resource_accounting.md @@ -23,13 +23,11 @@ All jobs are accounted in normalized core-hours, using factor F valid at the tim | Anselm | 0.65 | 2017-09-11 to 2018-06-01 | The accounting runs whenever the computational cores are allocated via the PBS Pro workload manager (the qsub command), regardless of whether -the cores are actually used for any calculation. - - +the cores are actually used for any calculation. !!! note **The allocations are requested/granted in normalized core-hours NCH.** - + !!! warning Whenever the term core-hour is used in this documentation, we mean the normalized core-hour, NCH. @@ -65,4 +63,5 @@ Legend WCH = Wall-clock Core Hour NCH = Normalized Core Hour ``` -The **it4ifree** command is a part of it4i.portal.clients package, located here: <https://pypi.python.org/pypi/it4i.portal.clients> \ No newline at end of file + +The **it4ifree** command is a part of it4i.portal.clients package, located here: <https://pypi.python.org/pypi/it4i.portal.clients> diff --git a/docs.it4i/anselm/software/ansys/ansys-cfx.md b/docs.it4i/software/ansys/ansys-cfx.md similarity index 88% rename from docs.it4i/anselm/software/ansys/ansys-cfx.md rename to docs.it4i/software/ansys/ansys-cfx.md index b816f026430e7573ddffb81d5ced15770994435b..111c5e3663dd2102e1026f65fae2aeb1fbd8f599 100644 --- a/docs.it4i/anselm/software/ansys/ansys-cfx.md +++ b/docs.it4i/software/ansys/ansys-cfx.md @@ -47,7 +47,7 @@ echo Machines: $hl /ansys_inc/v145/CFX/bin/cfx5solve -def input.def -size 4 -size-ni 4x -part-large -start-method "Platform MPI Distributed Parallel" -par-dist $hl -P aa_r ``` -Header of the PBS file (above) is common and description can be find on [this site](../../job-submission-and-execution/). SVS FEM recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. 
+Header of the PBS file (above) is common and description can be find on [this site](../../anselm/job-submission-and-execution/). SVS FEM recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. Working directory has to be created before sending PBS job into the queue. Input file should be in working directory or full path to input file has to be specified. >Input file has to be defined by common CFX def file which is attached to the cfx solver via parameter -def diff --git a/docs.it4i/anselm/software/ansys/ansys-fluent.md b/docs.it4i/software/ansys/ansys-fluent.md similarity index 92% rename from docs.it4i/anselm/software/ansys/ansys-fluent.md rename to docs.it4i/software/ansys/ansys-fluent.md index 4521c758ed7def8e6795f9de97ecb0d698cd9dc9..74326f978d9a088b0ff523fda89d6d502b363dd0 100644 --- a/docs.it4i/anselm/software/ansys/ansys-fluent.md +++ b/docs.it4i/software/ansys/ansys-fluent.md @@ -38,7 +38,7 @@ NCORES=`wc -l $PBS_NODEFILE |awk '{print $1}'` /ansys_inc/v145/fluent/bin/fluent 3d -t$NCORES -cnf=$PBS_NODEFILE -g -i fluent.jou ``` -Header of the pbs file (above) is common and description can be find on [this site](../../resources-allocation-policy/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. +Header of the pbs file (above) is common and description can be find on [this site](../../anselm/resources-allocation-policy/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. Working directory has to be created before sending pbs job into the queue. Input file should be in working directory or full path to input file has to be specified. Input file has to be defined by common Fluent journal file which is attached to the Fluent solver via parameter -i fluent.jou @@ -151,12 +151,12 @@ Fluent could be run in parallel only under Academic Research license. To do so t ANSLIC_ADMIN Utility will be run - + - + - + ANSYS Academic Research license should be moved up to the top of the list. - + diff --git a/docs.it4i/anselm/software/ansys/ansys-ls-dyna.md b/docs.it4i/software/ansys/ansys-ls-dyna.md similarity index 87% rename from docs.it4i/anselm/software/ansys/ansys-ls-dyna.md rename to docs.it4i/software/ansys/ansys-ls-dyna.md index af46af93a30600c440e4e52cb5fdbd1edb677660..46a8ed726fb4da82bb743a71a98aa5e4b9f88132 100644 --- a/docs.it4i/anselm/software/ansys/ansys-ls-dyna.md +++ b/docs.it4i/software/ansys/ansys-ls-dyna.md @@ -50,6 +50,6 @@ echo Machines: $hl /ansys_inc/v145/ansys/bin/ansys145 -dis -lsdynampp i=input.k -machines $hl ``` -Header of the PBS file (above) is common and description can be find on [this site](../../job-submission-and-execution/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. 
+Header of the PBS file (above) is common and description can be find on [this site](../../anselm/job-submission-and-execution/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. Working directory has to be created before sending PBS job into the queue. Input file should be in working directory or full path to input file has to be specified. Input file has to be defined by common LS-DYNA .**k** file which is attached to the ANSYS solver via parameter i= diff --git a/docs.it4i/anselm/software/ansys/ansys-mechanical-apdl.md b/docs.it4i/software/ansys/ansys-mechanical-apdl.md similarity index 87% rename from docs.it4i/anselm/software/ansys/ansys-mechanical-apdl.md rename to docs.it4i/software/ansys/ansys-mechanical-apdl.md index cdaac19ff664acbcd79c8c234ff30ff54cf06cad..b33f77104100f5504e297484a586cb9a0a7e0201 100644 --- a/docs.it4i/anselm/software/ansys/ansys-mechanical-apdl.md +++ b/docs.it4i/software/ansys/ansys-mechanical-apdl.md @@ -49,7 +49,7 @@ echo Machines: $hl /ansys_inc/v145/ansys/bin/ansys145 -b -dis -p aa_r -i input.dat -o file.out -machines $hl -dir $WORK_DIR ``` -Header of the PBS file (above) is common and description can be found on [this site](../../resource-allocation-policy/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allow to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. +Header of the PBS file (above) is common and description can be found on [this site](../../anselm/resources-allocation-policy/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allow to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. Working directory has to be created before sending PBS job into the queue. Input file should be in working directory or full path to input file has to be specified. Input file has to be defined by common APDL file which is attached to the ANSYS solver via parameter -i diff --git a/docs.it4i/anselm/software/ansys/ansys.md b/docs.it4i/software/ansys/ansys.md similarity index 96% rename from docs.it4i/anselm/software/ansys/ansys.md rename to docs.it4i/software/ansys/ansys.md index 24b8b1c09721168d11a214f00a2ee50a109e6c20..d1def39cd2775933d56ce89a58a2cc3c87ea388c 100644 --- a/docs.it4i/anselm/software/ansys/ansys.md +++ b/docs.it4i/software/ansys/ansys.md @@ -2,7 +2,7 @@ **[SVS FEM](http://www.svsfem.cz/)** as **[ANSYS Channel partner](http://www.ansys.com/)** for Czech Republic provided all ANSYS licenses for ANSELM cluster and supports of all ANSYS Products (Multiphysics, Mechanical, MAPDL, CFX, Fluent, Maxwell, LS-DYNA...) to IT staff and ANSYS users. If you are challenging to problem of ANSYS functionality contact please [hotline@svsfem.cz](mailto:hotline@svsfem.cz?subject=Ostrava%20-%20ANSELM) -Anselm provides commercial as well as academic variants. Academic variants are distinguished by "**Academic...**" word in the name of license or by two letter preposition "**aa\_**" in the license feature name. 
Change of license is realized on command line respectively directly in user's PBS file (see individual products). [More about licensing here](ansys/licensing/) +Anselm provides commercial as well as academic variants. Academic variants are distinguished by "**Academic...**" word in the name of license or by two letter preposition "**aa\_**" in the license feature name. Change of license is realized on command line respectively directly in user's PBS file (see individual products). [More about licensing here](licensing/) To load the latest version of any ANSYS product (Mechanical, Fluent, CFX, MAPDL,...) load the module: diff --git a/docs.it4i/salomon/software/ansys/licensing.md b/docs.it4i/software/ansys/licensing.md similarity index 100% rename from docs.it4i/salomon/software/ansys/licensing.md rename to docs.it4i/software/ansys/licensing.md diff --git a/docs.it4i/anselm/software/ansys/ls-dyna.md b/docs.it4i/software/ansys/ls-dyna.md similarity index 85% rename from docs.it4i/anselm/software/ansys/ls-dyna.md rename to docs.it4i/software/ansys/ls-dyna.md index 063bcf245e7b74781c953eebb309adfad5c0e48d..3bd9deef62ba5ac1456c992f3a7ed74ddc034eff 100644 --- a/docs.it4i/anselm/software/ansys/ls-dyna.md +++ b/docs.it4i/software/ansys/ls-dyna.md @@ -30,6 +30,6 @@ module load lsdyna /apps/engineering/lsdyna/lsdyna700s i=input.k ``` -Header of the PBS file (above) is common and description can be find on [this site](../../job-submission-and-execution/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. +Header of the PBS file (above) is common and description can be find on [this site](../../anselm/job-submission-and-execution/). [SVS FEM](http://www.svsfem.cz) recommends to utilize sources by keywords: nodes, ppn. These keywords allows to address directly the number of nodes (computers) and cores (ppn) which will be utilized in the job. Also the rest of code assumes such structure of allocated resources. Working directory has to be created before sending PBS job into the queue. Input file should be in working directory or full path to input file has to be specified. Input file has to be defined by common LS-DYNA **.k** file which is attached to the LS-DYNA solver via parameter i= diff --git a/docs.it4i/salomon/software/ansys/setting-license-preferences.md b/docs.it4i/software/ansys/setting-license-preferences.md similarity index 82% rename from docs.it4i/salomon/software/ansys/setting-license-preferences.md rename to docs.it4i/software/ansys/setting-license-preferences.md index b3f594d14863cde6aaa28f7a5139223d30a7d95b..74234c028d9068a3978dd5a4bd0ef70c8d47be8a 100644 --- a/docs.it4i/salomon/software/ansys/setting-license-preferences.md +++ b/docs.it4i/software/ansys/setting-license-preferences.md @@ -12,12 +12,12 @@ $ANSYSLIC_DIR/lic_admin/anslic_admin ANSLIC_ADMIN Utility will be run - + - + - + ANSYS Academic Research license should be moved up to the top or down to the bottom of the list. 
- + diff --git a/docs.it4i/salomon/software/ansys/workbench.md b/docs.it4i/software/ansys/workbench.md similarity index 98% rename from docs.it4i/salomon/software/ansys/workbench.md rename to docs.it4i/software/ansys/workbench.md index 1b138ccd09fa64fd6ccbafbcb40ff14b2959bad4..0cd523838cc5a4367d3439354e20b1a3caa9fca7 100644 --- a/docs.it4i/salomon/software/ansys/workbench.md +++ b/docs.it4i/software/ansys/workbench.md @@ -4,7 +4,7 @@ It is possible to run Workbench scripts in batch mode. You need to configure solvers of individual components to run in parallel mode. Open your project in Workbench. Then, for example, in Mechanical, go to Tools - Solve Process Settings ... - + Enable Distribute Solution checkbox and enter number of cores (eg. 48 to run on two Salomon nodes). If you want the job to run on more then 1 node, you must also provide a so called MPI appfile. In the Additional Command Line Arguments input field, enter: diff --git a/docs.it4i/salomon/software/chemistry/INCAR b/docs.it4i/software/chemistry/INCAR similarity index 100% rename from docs.it4i/salomon/software/chemistry/INCAR rename to docs.it4i/software/chemistry/INCAR diff --git a/docs.it4i/salomon/software/chemistry/KPOINTS b/docs.it4i/software/chemistry/KPOINTS similarity index 100% rename from docs.it4i/salomon/software/chemistry/KPOINTS rename to docs.it4i/software/chemistry/KPOINTS diff --git a/docs.it4i/salomon/software/chemistry/POSCAR b/docs.it4i/software/chemistry/POSCAR similarity index 100% rename from docs.it4i/salomon/software/chemistry/POSCAR rename to docs.it4i/software/chemistry/POSCAR diff --git a/docs.it4i/salomon/software/chemistry/POTCAR b/docs.it4i/software/chemistry/POTCAR similarity index 100% rename from docs.it4i/salomon/software/chemistry/POTCAR rename to docs.it4i/software/chemistry/POTCAR diff --git a/docs.it4i/salomon/software/chemistry/gofree-cond1.sh b/docs.it4i/software/chemistry/gofree-cond1.sh similarity index 100% rename from docs.it4i/salomon/software/chemistry/gofree-cond1.sh rename to docs.it4i/software/chemistry/gofree-cond1.sh diff --git a/docs.it4i/salomon/software/chemistry/molpro.md b/docs.it4i/software/chemistry/molpro.md similarity index 75% rename from docs.it4i/salomon/software/chemistry/molpro.md rename to docs.it4i/software/chemistry/molpro.md index ab53760cda8c5efa186e93d7ab9d4b4032979f53..2fb61643afd70154ca9870375bae76ff27188805 100644 --- a/docs.it4i/salomon/software/chemistry/molpro.md +++ b/docs.it4i/software/chemistry/molpro.md @@ -35,29 +35,29 @@ Molpro is compiled for parallel execution using MPI and OpenMP. By default, Molp !!! note The OpenMP parallelization in Molpro is limited and has been observed to produce limited scaling. We therefore recommend to use MPI parallelization only. This can be achieved by passing option mpiprocs=16:ompthreads=1 to PBS. -You are advised to use the -d option to point to a directory in [SCRATCH filesystem](../../storage/storage/). Molpro can produce a large amount of temporary data during its run, and it is important that these are placed in the fast scratch filesystem. +You are advised to use the -d option to point to a directory in [SCRATCH file system - Salomon](../../salomon/storage/). Molpro can produce a large amount of temporary data during its run, and it is important that these are placed in the fast scratch file system. 
### Example jobscript ```bash - #PBS -A IT4I-0-0 - #PBS -q qprod - #PBS -l select=1:ncpus=16:mpiprocs=16:ompthreads=1 +#PBS -A IT4I-0-0 +#PBS -q qprod +#PBS -l select=1:ncpus=16:mpiprocs=16:ompthreads=1 - cd $PBS_O_WORKDIR +cd $PBS_O_WORKDIR - # load Molpro module - module add molpro +# load Molpro module +module add molpro - # create a directory in the SCRATCH filesystem - mkdir -p /scratch/$USER/$PBS_JOBID +# create a directory in the SCRATCH filesystem +mkdir -p /scratch/$USER/$PBS_JOBID - # copy an example input - cp /apps/chem/molpro/2010.1/molprop_2010_1_Linux_x86_64_i8/examples/caffeine_opt_diis.com . +# copy an example input +cp /apps/chem/molpro/2010.1/molprop_2010_1_Linux_x86_64_i8/examples/caffeine_opt_diis.com . - # run Molpro with default options - molpro -d /scratch/$USER/$PBS_JOBID caffeine_opt_diis.com +# run Molpro with default options +molpro -d /scratch/$USER/$PBS_JOBID caffeine_opt_diis.com - # delete scratch directory - rm -rf /scratch/$USER/$PBS_JOBID +# delete scratch directory +rm -rf /scratch/$USER/$PBS_JOBID ``` diff --git a/docs.it4i/software/chemistry/nwchem.md b/docs.it4i/software/chemistry/nwchem.md new file mode 100644 index 0000000000000000000000000000000000000000..41c2006e414243c979e987bfcbcfb85e932df72c --- /dev/null +++ b/docs.it4i/software/chemistry/nwchem.md @@ -0,0 +1,36 @@ +# NWChem + +## Introduction + +NWChem aims to provide its users with computational chemistry tools that are scalable both in their ability to treat large scientific computational chemistry problems efficiently, and in their use of available parallel computing resources from high-performance parallel supercomputers to conventional workstation clusters. + +[Homepage](http://www.nwchem-sw.org/index.php/Main_Page) + +## Installed Versions + +For a current list of installed versions, execute: + +```console +$ ml av NWChem +``` + +## Running + + NWChem is compiled for parallel MPI execution. Normal procedure for MPI jobs applies. Sample jobscript (for Salomon on 24 threads): + +```bash +#PBS -A IT4I-0-0 +#PBS -q qprod +#PBS -l select=1:ncpus=24:mpiprocs=24 + +cd $PBS_O_WORKDIR +module add NWChem +mpirun nwchem h2o.nw +``` + +## Options + +Please refer to [the documentation](http://www.nwchem-sw.org/index.php/Release62:Top-level) and in the input file set the following directives : + +* MEMORY : controls the amount of memory NWChem will use +* SCRATCH_DIR : set this to a directory in [SCRATCH filesystem - Salomon](../../salomon/storage/) (or run the calculation completely in a scratch directory). For certain calculations, it might be advisable to reduce I/O by forcing "direct" mode, eg. "scf direct" diff --git a/docs.it4i/software/orca.md b/docs.it4i/software/chemistry/orca.md similarity index 99% rename from docs.it4i/software/orca.md rename to docs.it4i/software/chemistry/orca.md index 00a695f488102918c05406a749508291acae77c6..5712a6d891cc40578ecca371442669b68557faf4 100644 --- a/docs.it4i/software/orca.md +++ b/docs.it4i/software/chemistry/orca.md @@ -114,7 +114,7 @@ ml ORCA/4.0.1.2 ${EBROOTORCA}/orca orca_parallel.inp ``` -!!! note +!!! note When running ORCA in parallel, ORCA should **NOT** be started with mpirun: e.g. mpirun -np 4 orca etc. like many MPI programs and has to be called with full pathname. Submit this job to the queue and see the output file. 
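As a usage sketch for the ORCA note above (the jobscript name is hypothetical; ORCA writes its results to standard output, which PBS captures in the job's output file):

```console
$ qsub example-orca.sh        # hypothetical jobscript wrapping the orca command shown above
$ qstat -u $USER              # watch the job state
$ cat example-orca.sh.o*      # inspect the captured ORCA output once the job finishes
```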
diff --git a/docs.it4i/salomon/software/chemistry/phono3py.md b/docs.it4i/software/chemistry/phono3py.md similarity index 97% rename from docs.it4i/salomon/software/chemistry/phono3py.md rename to docs.it4i/software/chemistry/phono3py.md index 5f366baa1e6acb0cb948cd473a9acb65243691c8..884f25cbf89a4b43afa3afc9a02fdbec171c9383 100644 --- a/docs.it4i/salomon/software/chemistry/phono3py.md +++ b/docs.it4i/software/chemistry/phono3py.md @@ -18,7 +18,7 @@ $ ml phono3py ### Calculating Force Constants -One needs to calculate second order and third order force constants using the diamond structure of silicon stored in [POSCAR](poscar-si) (the same form as in VASP) using single displacement calculations within supercell. +One needs to calculate second order and third order force constants using the diamond structure of silicon stored in [POSCAR](POSCAR) (the same form as in VASP) using single displacement calculations within supercell. ```console $ cat POSCAR diff --git a/docs.it4i/salomon/software/chemistry/prepare.sh b/docs.it4i/software/chemistry/prepare.sh similarity index 100% rename from docs.it4i/salomon/software/chemistry/prepare.sh rename to docs.it4i/software/chemistry/prepare.sh diff --git a/docs.it4i/salomon/software/chemistry/run.sh b/docs.it4i/software/chemistry/run.sh similarity index 100% rename from docs.it4i/salomon/software/chemistry/run.sh rename to docs.it4i/software/chemistry/run.sh diff --git a/docs.it4i/salomon/software/chemistry/submit.sh b/docs.it4i/software/chemistry/submit.sh similarity index 100% rename from docs.it4i/salomon/software/chemistry/submit.sh rename to docs.it4i/software/chemistry/submit.sh diff --git a/docs.it4i/salomon/software/compilers.md b/docs.it4i/software/compilers.md similarity index 92% rename from docs.it4i/salomon/software/compilers.md rename to docs.it4i/software/compilers.md index a49aa8eb4dfa2d832572e8c225b6ceccdd84bc82..8e0d8dee43fd99bb058dc3d5f8b125c1f74891a8 100644 --- a/docs.it4i/salomon/software/compilers.md +++ b/docs.it4i/software/compilers.md @@ -1,6 +1,6 @@ # Compilers -Available compilers, including GNU, INTEL and UPC compilers +## Available compilers, including GNU, INTEL and UPC compilers There are several compilers for different programming languages available on the cluster: @@ -8,6 +8,7 @@ There are several compilers for different programming languages available on the * Fortran 77/90/95/HPF * Unified Parallel C * Java +* NVIDIA CUDA (only on Anselm) The C/C++ and Fortran compilers are provided by: @@ -23,11 +24,11 @@ Commercial licenses: ## Intel Compilers -For information about the usage of Intel Compilers and other Intel products, please read the [Intel Parallel studio](intel-suite/) page. +For information about the usage of Intel Compilers and other Intel products, please read the [Intel Parallel studio](intel-suite/intel-compilers/) page. -## PGI Compilers +## PGI Compilers (only on Salomon) -The Portland Group Cluster Development Kit (PGI CDK) is available. +The Portland Group Cluster Development Kit (PGI CDK) is available on Salomon. ```console $ module load PGI @@ -132,7 +133,7 @@ For more information see the man pages. 
To use the Berkley UPC compiler and runtime environment to run the binaries use the module bupc ```console -$ module add BerkeleyUPC/2.16.2-gompi-2015b +$ module add BerkeleyUPC/2.16.2-gompi-2015b # on Anselm: ml bupc $ upcc -version ``` @@ -174,7 +175,7 @@ To run the example with 5 threads issue $ upcrun -n 5 ./hello.upc.x ``` -To run the example on two compute nodes using all 48 cores, with 48 threads, issue +To run the example on two compute nodes using all 48 cores, with 48 threads, issue (on Anselm, use all 32 cores) ```console $ qsub -I -q qprod -A PROJECT_ID -l select=2:ncpus=24 @@ -190,4 +191,4 @@ For information how to use Java (runtime and/or compiler), please read the [Java ## NVIDIA CUDA -For information how to work with NVIDIA CUDA, please read the [NVIDIA CUDA page](../../anselm/software/nvidia-cuda/). +For information how to work with NVIDIA CUDA, please read the [NVIDIA CUDA page](../anselm/software/nvidia-cuda/). diff --git a/docs.it4i/salomon/software/comsol/comsol-multiphysics.md b/docs.it4i/software/comsol/comsol-multiphysics.md similarity index 83% rename from docs.it4i/salomon/software/comsol/comsol-multiphysics.md rename to docs.it4i/software/comsol/comsol-multiphysics.md index 431294469311b408c9e023c17347cae239037622..c5170bfcffbafc2e8e744cf97ff1b7501f2c6b0b 100644 --- a/docs.it4i/salomon/software/comsol/comsol-multiphysics.md +++ b/docs.it4i/software/comsol/comsol-multiphysics.md @@ -16,14 +16,14 @@ COMSOL also allows an interface support for equation-based modelling of partial On the clusters COMSOL is available in the latest stable version. There are two variants of the release: -* **Non commercial** or so called >**EDU variant**>, which can be used for research and educational purposes. +* **Non commercial** or so called **EDU variant**, which can be used for research and educational purposes. -* **Commercial** or so called **COM variant**, which can used also for commercial activities. **COM variant** has only subset of features compared to the **EDU variant** available. More about licensing will be posted here soon. +* **Commercial** or so called **COM variant**, which can also be used for commercial activities. The **COM variant** offers only a subset of the features available in the **EDU variant**. More about licensing [here](licensing-and-available-versions/). To load the of COMSOL load the module ```console -$ ml COMSOL/51-EDU +$ ml COMSOL ``` By default the **EDU variant** will be loaded. If user needs other version or variant, load the particular version. To obtain the list of available versions use @@ -32,7 +32,9 @@ By default the **EDU variant** will be loaded. If user needs other version or va $ ml av COMSOL ``` -If user needs to prepare COMSOL jobs in the interactive mode it is recommend to use COMSOL on the compute nodes via PBS Pro scheduler. In order run the COMSOL Desktop GUI on Windows is recommended to use the [Virtual Network Computing (VNC)](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/). +If a user needs to prepare COMSOL jobs interactively, it is recommended to use COMSOL on the compute nodes via the PBS Pro scheduler. To run the COMSOL Desktop GUI on Windows, it is recommended to use [Virtual Network Computing (VNC)](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/).
+ +Example for Salomon: ```console $ xhost + @@ -50,7 +52,7 @@ To run COMSOL in batch mode, without the COMSOL Desktop GUI environment, user ca #PBS -N JOB_NAME #PBS -A PROJECT_ID -cd /scratch/work/user/$USER/ || exit +cd /scratch/work/user/$USER/ || exit # on Anselm use: /scratch/$USER echo Time is `date` echo Directory is `pwd` @@ -65,7 +67,7 @@ module load COMSOL ntask=$(wc -l $PBS_NODEFILE) -comsol -nn ${ntask} batch -configuration /tmp –mpiarg –rmk –mpiarg pbs -tmpdir /scratch/$USER/ -inputfile name_input_f.mph -outputfile name_output_f.mph -batchlog name_log_f.log +comsol -nn ${ntask} batch -configuration /tmp -mpiarg -rmk -mpiarg pbs -tmpdir /scratch/.../$USER/ -inputfile name_input_f.mph -outputfile name_output_f.mph -batchlog name_log_f.log ``` Working directory has to be created before sending the (comsol.pbs) job script into the queue. Input file (name_input_f.mph) has to be in working directory or full path to input file has to be specified. The appropriate path to the temp directory of the job has to be set by command option (-tmpdir). @@ -74,7 +76,7 @@ Working directory has to be created before sending the (comsol.pbs) job script i COMSOL is the software package for the numerical solution of the partial differential equations. LiveLink for MATLAB allows connection to the COMSOL API (Application Programming Interface) with the benefits of the programming language and computing environment of the MATLAB. -LiveLink for MATLAB is available in both **EDU** and **COM** **variant** of the COMSOL release. On the clusters 1 commercial (**COM**) license and the 5 educational (**EDU**) licenses of LiveLink for MATLAB (please see the [ISV Licenses](../../../anselm/software/isv_licenses/)) are available. Following example shows how to start COMSOL model from MATLAB via LiveLink in the interactive mode. +LiveLink for MATLAB is available in both the **EDU** and **COM** **variant** of the COMSOL release. On the clusters, 1 commercial (**COM**) license and 5 educational (**EDU**) licenses of LiveLink for MATLAB are available (please see the [ISV Licenses](../isv_licenses/)). The following example shows how to start a COMSOL model from MATLAB via LiveLink in the interactive mode (on Anselm, use 16 threads). ```console $ xhost + @@ -95,7 +97,7 @@ To run LiveLink for MATLAB in batch mode with (comsol_matlab.pbs) job script you #PBS -N JOB_NAME #PBS -A PROJECT_ID -cd /scratch/work/user/$USER || exit +cd /scratch/work/user/$USER || exit # on Anselm use: /scratch/$USER echo Time is `date` echo Directory is `pwd` @@ -115,4 +117,4 @@ cd /apps/cae/COMSOL/51/mli matlab -nodesktop -nosplash -r "mphstart; addpath /scratch/work/user/$USER/work; test_job" -This example shows how to run Livelink for MATLAB with following configuration: 3 nodes and 16 cores per node. Working directory has to be created before submitting (comsol_matlab.pbs) job script into the queue. Input file (test_job.m) has to be in working directory or full path to input file has to be specified. The Matlab command option (-r ”mphstart”) created a connection with a COMSOL server using the default port number. +This example shows how to run LiveLink for MATLAB with the following configuration: 3 nodes and 24 cores per node. The working directory has to be created before submitting the (comsol_matlab.pbs) job script into the queue. The input file (test_job.m) has to be in the working directory, or the full path to the input file has to be specified. The MATLAB command option (-r ”mphstart”) creates a connection with a COMSOL server using the default port number.
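A minimal sketch of the preparation steps described above for the LiveLink example, assuming the Salomon scratch layout used in the job script (on Anselm, substitute /scratch/$USER):

```console
$ mkdir -p /scratch/work/user/$USER/work          # create the working directory first
$ cp test_job.m /scratch/work/user/$USER/work/    # the input file must be in the working directory
$ qsub comsol_matlab.pbs                          # then submit the LiveLink job script
```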
diff --git a/docs.it4i/salomon/software/comsol/licensing-and-available-versions.md b/docs.it4i/software/comsol/licensing-and-available-versions.md similarity index 100% rename from docs.it4i/salomon/software/comsol/licensing-and-available-versions.md rename to docs.it4i/software/comsol/licensing-and-available-versions.md diff --git a/docs.it4i/salomon/software/debuggers/Introduction.md b/docs.it4i/software/debuggers/Introduction.md similarity index 100% rename from docs.it4i/salomon/software/debuggers/Introduction.md rename to docs.it4i/software/debuggers/Introduction.md diff --git a/docs.it4i/salomon/software/debuggers/aislinn.md b/docs.it4i/software/debuggers/aislinn.md similarity index 99% rename from docs.it4i/salomon/software/debuggers/aislinn.md rename to docs.it4i/software/debuggers/aislinn.md index 89cf7538016c004b1ba9058bcf148bbf0761eb50..2a945a04e8c74218b312c2ad9d6bdd08f1f54a5b 100644 --- a/docs.it4i/salomon/software/debuggers/aislinn.md +++ b/docs.it4i/software/debuggers/aislinn.md @@ -79,7 +79,7 @@ $ firefox report.html At the beginning of the report there are some basic summaries of the verification. In the second part (depicted in the following picture), the error is described. - + It shows us: diff --git a/docs.it4i/salomon/software/debuggers/allinea-ddt.md b/docs.it4i/software/debuggers/allinea-ddt.md similarity index 99% rename from docs.it4i/salomon/software/debuggers/allinea-ddt.md rename to docs.it4i/software/debuggers/allinea-ddt.md index 6e1f046f10fd2d521343a995cb59580440080a73..67bfdff184ed1244a154848de728d58f4c678c94 100644 --- a/docs.it4i/salomon/software/debuggers/allinea-ddt.md +++ b/docs.it4i/software/debuggers/allinea-ddt.md @@ -75,7 +75,7 @@ $ ddt test_debug A submission window that appears have a prefilled path to the executable to debug. You can select the number of MPI processors and/or OpenMP threads on which to run and press run. Command line arguments to a program can be entered to the "Arguments " box. - + To start the debugging directly without the submission window, user can specify the debugging and execution parameters from the command line. For example the number of MPI processes is set by option "-np 4". Skipping the dialog is done by "-start" option. To see the list of the "ddt" command line parameters, run "ddt --help". diff --git a/docs.it4i/salomon/software/debuggers/allinea-performance-reports.md b/docs.it4i/software/debuggers/allinea-performance-reports.md similarity index 96% rename from docs.it4i/salomon/software/debuggers/allinea-performance-reports.md rename to docs.it4i/software/debuggers/allinea-performance-reports.md index ead91a093c83ba9503f2be7ba702e698d7bca0df..14451cdb0a25a5704fcafb61b9b0a558b6df882c 100644 --- a/docs.it4i/salomon/software/debuggers/allinea-performance-reports.md +++ b/docs.it4i/software/debuggers/allinea-performance-reports.md @@ -28,7 +28,7 @@ Instead of [running your MPI program the usual way](../mpi/mpi/), use the the pe $ perf-report mpirun ./mympiprog.x ``` -The mpi program will run as usual. The perf-report creates two additional files, in \*.txt and \*.html format, containing the performance report. Note that demanding MPI codes should be run within [the queue system](../../job-submission-and-execution/). +The mpi program will run as usual. The perf-report creates two additional files, in \*.txt and \*.html format, containing the performance report. Note that demanding MPI codes should be run within [the queue system](../../anselm/job-submission-and-execution/). 
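To inspect the report files that perf-report produces, something like the following can be used (file names follow the program name and launch configuration, as in the mympiprog_32p_* samples kept alongside this page):

```console
$ cat mympiprog_32p_*.txt          # plain-text summary in the terminal
$ firefox mympiprog_32p_*.html &   # full report in a browser (requires X forwarding or VNC)
```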
## Example @@ -45,7 +45,7 @@ $ qsub -q qexp -l select=2:ppn=24:mpiprocs=24:ompthreads=1 -I Then we load the modules and run the program the usual way: ```console -$ ml intel +$ ml intel $ ml PerfReports/6.0 $ mpirun ./mympiprog.x ``` diff --git a/docs.it4i/anselm/software/debuggers/cube.md b/docs.it4i/software/debuggers/cube.md similarity index 95% rename from docs.it4i/anselm/software/debuggers/cube.md rename to docs.it4i/software/debuggers/cube.md index a7f88955e78159f5800a37e603f91fa09e3ccdbe..4edf6ea02a445e633315ae4448c72e2c74a72fae 100644 --- a/docs.it4i/anselm/software/debuggers/cube.md +++ b/docs.it4i/software/debuggers/cube.md @@ -10,7 +10,7 @@ CUBE is a graphical performance report explorer for displaying data from Score-P Each dimension is organized in a tree, for example the time performance metric is divided into Execution time and Overhead time, call path dimension is organized by files and routines in your source code etc. - + \*Figure 1. Screenshot of CUBE displaying data from Scalasca.\* @@ -18,7 +18,7 @@ Each node in the tree is colored by severity (the color scheme is displayed at t ## Installed Versions -Currently, there are two versions of CUBE 4.2.3 available as [modules](../../environment-and-modules/): +Currently, there are two versions of CUBE 4.2.3 available as [modules](../../modules-matrix/): * cube/4.2.3-gcc, compiled with GCC * cube/4.2.3-icc, compiled with Intel compiler diff --git a/docs.it4i/anselm/software/debuggers/intel-performance-counter-monitor.md b/docs.it4i/software/debuggers/intel-performance-counter-monitor.md similarity index 99% rename from docs.it4i/anselm/software/debuggers/intel-performance-counter-monitor.md rename to docs.it4i/software/debuggers/intel-performance-counter-monitor.md index b46b472b68577a3f0764199439de310a967a4bde..3373cc4eeb9c92ab49f1ee9e72005d70911d1f46 100644 --- a/docs.it4i/anselm/software/debuggers/intel-performance-counter-monitor.md +++ b/docs.it4i/software/debuggers/intel-performance-counter-monitor.md @@ -6,7 +6,7 @@ Intel PCM (Performance Counter Monitor) is a tool to monitor performance hardwar ## Installed Version -Currently installed version 2.6. To load the [module](../../environment-and-modules/), issue: +Currently installed version 2.6. To load the [module](../../modules-matrix/) issue: ```console $ ml intelpcm diff --git a/docs.it4i/salomon/software/debuggers/intel-vtune-amplifier.md b/docs.it4i/software/debuggers/intel-vtune-amplifier.md similarity index 99% rename from docs.it4i/salomon/software/debuggers/intel-vtune-amplifier.md rename to docs.it4i/software/debuggers/intel-vtune-amplifier.md index 192aece7e250dfb9b2938daebe83606a1f002b06..7da1fc0053cd8b7c9c62fe205b45397b083bc059 100644 --- a/docs.it4i/salomon/software/debuggers/intel-vtune-amplifier.md +++ b/docs.it4i/software/debuggers/intel-vtune-amplifier.md @@ -9,7 +9,7 @@ Intel *®* VTune™ Amplifier, part of Intel Parallel studio, is a GUI profiling * Low level specific counters, such as branch analysis and memory bandwidth * Power usage analysis - frequency and sleep states. 
- + ## Usage diff --git a/docs.it4i/salomon/software/debuggers/mympiprog_32p_2014-10-15_16-56.html b/docs.it4i/software/debuggers/mympiprog_32p_2014-10-15_16-56.html similarity index 100% rename from docs.it4i/salomon/software/debuggers/mympiprog_32p_2014-10-15_16-56.html rename to docs.it4i/software/debuggers/mympiprog_32p_2014-10-15_16-56.html diff --git a/docs.it4i/salomon/software/debuggers/mympiprog_32p_2014-10-15_16-56.txt b/docs.it4i/software/debuggers/mympiprog_32p_2014-10-15_16-56.txt similarity index 100% rename from docs.it4i/salomon/software/debuggers/mympiprog_32p_2014-10-15_16-56.txt rename to docs.it4i/software/debuggers/mympiprog_32p_2014-10-15_16-56.txt diff --git a/docs.it4i/anselm/software/debuggers/papi.md b/docs.it4i/software/debuggers/papi.md similarity index 99% rename from docs.it4i/anselm/software/debuggers/papi.md rename to docs.it4i/software/debuggers/papi.md index d03dd8354769895e3b7f8454f5a0dd613a626bc3..15b03837e4bb18d2b8b6357d60bb29574b30fdae 100644 --- a/docs.it4i/anselm/software/debuggers/papi.md +++ b/docs.it4i/software/debuggers/papi.md @@ -10,7 +10,7 @@ PAPI can be used with parallel as well as serial programs. ## Usage -To use PAPI, load [module](../../environment-and-modules/) papi: +To use PAPI, load [module](../../modules-matrix/) papi: ```console $ ml papi diff --git a/docs.it4i/anselm/software/debuggers/scalasca.md b/docs.it4i/software/debuggers/scalasca.md similarity index 96% rename from docs.it4i/anselm/software/debuggers/scalasca.md rename to docs.it4i/software/debuggers/scalasca.md index a7cd44b1d5236eb3e257a24f5a3cfbdb96e6b0f5..f8f1db9d0249bf7f25b448f3e017aadddff08181 100644 --- a/docs.it4i/anselm/software/debuggers/scalasca.md +++ b/docs.it4i/software/debuggers/scalasca.md @@ -8,7 +8,7 @@ Scalasca supports profiling of MPI, OpenMP and hybrid MPI+OpenMP applications. ## Installed Versions -There are currently two versions of Scalasca 2.0 [modules](../../environment-and-modules/) installed on Anselm: +There are currently two versions of Scalasca 2.0 [modules](../../modules-matrix/) installed on Anselm: * scalasca2/2.0-gcc-openmpi, for usage with [GNU Compiler](../compilers/) and [OpenMPI](../mpi/Running_OpenMPI/), * scalasca2/2.0-icc-impi, for usage with [Intel Compiler](../compilers/) and [Intel MPI](../mpi/running-mpich2/). @@ -43,7 +43,7 @@ Some notable Scalasca options are: * **-e <directory> Specify a directory to save the collected data to. By default, Scalasca saves the data to a directory with prefix scorep\_, followed by name of the executable and launch configuration.** !!! note - Scalasca can generate a huge amount of data, especially if tracing is enabled. Please consider saving the data to a [scratch directory](../../storage/storage/). + Scalasca can generate a huge amount of data, especially if tracing is enabled. Please consider saving the data to a [scratch directory](../../salomon/storage/). ### Analysis of Reports diff --git a/docs.it4i/anselm/software/debuggers/score-p.md b/docs.it4i/software/debuggers/score-p.md similarity index 69% rename from docs.it4i/anselm/software/debuggers/score-p.md rename to docs.it4i/software/debuggers/score-p.md index 3295933c45e6c7f8b7275a5bede4cef5064bd49f..186762617bc0ae39f624df391b0816044fc5f327 100644 --- a/docs.it4i/anselm/software/debuggers/score-p.md +++ b/docs.it4i/software/debuggers/score-p.md @@ -8,7 +8,7 @@ Score-P can be used as an instrumentation tool for [Scalasca](scalasca/). 
## Installed Versions -There are currently two versions of Score-P version 1.2.6 [modules](../../environment-and-modules/) installed on Anselm : +There are currently two versions of Score-P version 1.2.6 [modules](../../modules-matrix/) installed on Anselm : * scorep/1.2.3-gcc-openmpi, for usage with [GNU Compiler](../compilers/) and [OpenMPI](../mpi/Running_OpenMPI/) * scorep/1.2.3-icc-impi, for usage with [Intel Compiler](../compilers/)> and [Intel MPI](../mpi/running-mpich2/)>. @@ -50,28 +50,28 @@ To use this kind of instrumentation, use scorep with switch --user. You will the An example in C/C++ : ```cpp - #include <scorep/SCOREP_User.h> - void foo() - { - SCOREP_USER_REGION_DEFINE( my_region_handle ) - // more declarations - SCOREP_USER_REGION_BEGIN( my_region_handle, "foo", SCOREP_USER_REGION_TYPE_COMMON ) - // do something - SCOREP_USER_REGION_END( my_region_handle ) - } +#include <scorep/SCOREP_User.h> +void foo() +{ + SCOREP_USER_REGION_DEFINE( my_region_handle ) + // more declarations + SCOREP_USER_REGION_BEGIN( my_region_handle, "foo", SCOREP_USER_REGION_TYPE_COMMON ) + // do something + SCOREP_USER_REGION_END( my_region_handle ) +} ``` and Fortran : -```cpp - #include "scorep/SCOREP_User.inc" - subroutine foo - SCOREP_USER_REGION_DEFINE( my_region_handle ) - ! more declarations - SCOREP_USER_REGION_BEGIN( my_region_handle, "foo", SCOREP_USER_REGION_TYPE_COMMON ) - ! do something - SCOREP_USER_REGION_END( my_region_handle ) - end subroutine foo +```fortran +#include "scorep/SCOREP_User.inc" +subroutine foo + SCOREP_USER_REGION_DEFINE( my_region_handle ) + ! more declarations + SCOREP_USER_REGION_BEGIN( my_region_handle, "foo", SCOREP_USER_REGION_TYPE_COMMON ) + ! do something + SCOREP_USER_REGION_END( my_region_handle ) +end subroutine foo ``` Please refer to the [documentation for description of the API](https://silc.zih.tu-dresden.de/scorep-current/pdf/scorep.pdf). @@ -83,35 +83,35 @@ This method uses POMP2 directives to mark regions to be instrumented. To use thi Example directives in C/C++ : ```cpp - void foo(...) +void foo(...) +{ + /* declarations */ + #pragma pomp inst begin(foo) + ... + if (<condition>) { - /* declarations */ - #pragma pomp inst begin(foo) - ... - if (<condition>) - { - #pragma pomp inst altend(foo) - return; - } - ... - #pragma pomp inst end(foo) + #pragma pomp inst altend(foo) + return; } + ... + #pragma pomp inst end(foo) +} ``` and in Fortran : -```cpp - subroutine foo(...) - !declarations - !POMP$ INST BEGIN(foo) - ... - if (<condition>) then - !POMP$ INST ALTEND(foo) - return - end if - ... - !POMP$ INST END(foo) - end subroutine foo +```fortran +subroutine foo(...) + !declarations + !POMP$ INST BEGIN(foo) + ... + if (<condition>) then + !POMP$ INST ALTEND(foo) + return + end if + ... + !POMP$ INST END(foo) +end subroutine foo ``` The directives are ignored if the program is compiled without Score-P. Again, please refer to the [documentation](https://silc.zih.tu-dresden.de/scorep-current/pdf/scorep.pdf) for a more elaborate description. 
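A rough sketch of building with the manual (user) instrumentation described above; the module and file names are illustrative and may differ between the clusters:

```console
$ ml scalasca2/2.0-icc-impi                # load a Score-P/Scalasca stack, or the GCC/OpenMPI variant
$ scorep --user mpiicc -o myapp myapp.c    # --user enables the SCOREP_USER_* macros
$ scorep --pomp mpiicc -o myapp myapp.c    # alternatively, --pomp instruments the POMP2 directives
$ mpirun -n 4 ./myapp                      # writes a scorep-* measurement directory
```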
diff --git a/docs.it4i/salomon/software/debuggers/total-view.md b/docs.it4i/software/debuggers/total-view.md similarity index 70% rename from docs.it4i/salomon/software/debuggers/total-view.md rename to docs.it4i/software/debuggers/total-view.md index 172836a05053e22bb4eb588a14f5a350d5b1f956..aebe91a523f00cc82fe566fed7f2102af9762509 100644 --- a/docs.it4i/salomon/software/debuggers/total-view.md +++ b/docs.it4i/software/debuggers/total-view.md @@ -13,7 +13,19 @@ On the cluster users can debug OpenMP or MPI code that runs up to 64 parallel pr Debugging of GPU accelerated codes is also supported. -You can check the status of the licenses [here](https://extranet.it4i.cz/rsweb/anselm/license/Totalview). +You can check the status of the licenses [here (Salomon)](https://extranet.it4i.cz/rsweb/anselm/license/Totalview) or type (Anselm): + +```console +$ cat /apps/user/licenses/totalview_features_state.txt + + # totalview + # ------------------------------------------------- + # FEATURE TOTAL USED AVAIL + # ------------------------------------------------- + TotalView_Team 64 0 64 + Replay 64 0 64 + CUDA 64 0 64 +``` ## Compiling Code to Run With TotalView @@ -28,7 +40,8 @@ Load all necessary modules to compile the code. For example: Load the TotalView module: ```console - ml TotalView/8.15.4-6-linux-x86-64 + ml TotalView + ml totalview ``` Compile the code: @@ -57,7 +70,7 @@ ssh -X username@salomon.it4i.cz Other options is to access login node using VNC. Please see the detailed information on how to use graphic user interface on Anselm. -From the login node an interactive session with X windows forwarding (-X option) can be started by following command: +From the login node an interactive session with X windows forwarding (-X option) can be started by following command (for Anselm use 16 threads): ```console $ qsub -I -X -A NONE-0-0 -q qexp -lselect=1:ncpus=24:mpiprocs=24,walltime=01:00:00 @@ -81,36 +94,40 @@ To debug a parallel code compiled with **OpenMPI** you need to setup your TotalV To be able to run parallel debugging procedure from the command line without stopping the debugger in the mpiexec source code you have to add the following function to your **~/.tvdrc** file. ```console - proc mpi_auto_run_starter {loaded_id} { - set starter_programs {mpirun mpiexec orterun} - set executable_name [TV::symbol get $loaded_id full_pathname] - set file_component [file tail $executable_name] - - if {[lsearch -exact $starter_programs $file_component] != -1} { - puts "*************************************" - puts "Automatically starting $file_component" - puts "*************************************" - dgo - } +proc mpi_auto_run_starter {loaded_id} { + set starter_programs {mpirun mpiexec orterun} + set executable_name [TV::symbol get $loaded_id full_pathname] + set file_component [file tail $executable_name] + + if {[lsearch -exact $starter_programs $file_component] != -1} { + puts "*************************************" + puts "Automatically starting $file_component" + puts "*************************************" + dgo } +} - # Append this function to TotalView's image load callbacks so that - # TotalView run this program automatically. +# Append this function to TotalView's image load callbacks so that +# TotalView run this program automatically. 
- dlappend TV::image_load_callbacks mpi_auto_run_starter +dlappend TV::image_load_callbacks mpi_auto_run_starter ``` The source code of this function can be also found in ```console -$ /apps/all/OpenMPI/1.10.1-GNU-4.9.3-2.25/etc/openmpi-totalview.tcl +$ /apps/all/OpenMPI/1.10.1-GNU-4.9.3-2.25/etc/openmpi-totalview.tcl #Salomon + +$ /apps/mpi/openmpi/intel/1.6.5/etc/openmpi-totalview.tcl #Anselm ``` You can also add only following line to you ~/.tvdrc file instead of the entire function: ```console -$ source /apps/all/OpenMPI/1.10.1-GNU-4.9.3-2.25/etc/openmpi-totalview.tcl +$ source /apps/all/OpenMPI/1.10.1-GNU-4.9.3-2.25/etc/openmpi-totalview.tcl #Salomon + +$ source /apps/mpi/openmpi/intel/1.6.5/etc/openmpi-totalview.tcl #Anselm ``` You need to do this step only once. See also [OpenMPI FAQ entry](https://www.open-mpi.org/faq/?category=running#run-with-tv) @@ -123,11 +140,11 @@ $ mpirun -tv -n 5 ./test_debug When following dialog appears click on "Yes" - + At this point the main TotalView GUI window will appear and you can insert the breakpoints and start debugging: - + ### Debugging a Parallel Code - Option 2 diff --git a/docs.it4i/anselm/software/debuggers/valgrind.md b/docs.it4i/software/debuggers/valgrind.md similarity index 88% rename from docs.it4i/anselm/software/debuggers/valgrind.md rename to docs.it4i/software/debuggers/valgrind.md index 0e381e945c86c1a53af181b8cb62194171535bee..b91eeb7a0642fb603a43029f577fb3310acee827 100644 --- a/docs.it4i/anselm/software/debuggers/valgrind.md +++ b/docs.it4i/software/debuggers/valgrind.md @@ -22,7 +22,13 @@ The main tools available in Valgrind are : There are two versions of Valgrind available on Anselm. * Version 3.6.0, installed by operating system vendor in /usr/bin/valgrind. This version is available by default, without the need to load any module. This version however does not provide additional MPI support. -* Version 3.9.0 with support for Intel MPI, available in [module](../../environment-and-modules/) valgrind/3.9.0-impi. After loading the module, this version replaces the default valgrind. +* Version 3.9.0 with support for Intel MPI, available in [module](../../modules-matrix/) valgrind/3.9.0-impi. After loading the module, this version replaces the default valgrind. + +There are two versions of Valgrind available on the Salomon. + +* Version 3.8.1, installed by operating system vendor in /usr/bin/valgrind. This version is available by default, without the need to load any module. This version however does not provide additional MPI support. Also, it does not support AVX2 instructions, debugging of an AVX2-enabled executable with this version will fail +* Version 3.11.0 built by ICC with support for Intel MPI, available in module Valgrind/3.11.0-intel-2015b. After loading the module, this version replaces the default valgrind. +* Version 3.11.0 built by GCC with support for Open MPI, module Valgrind/3.11.0-foss-2015b ## Usage @@ -31,19 +37,19 @@ Compile the application which you want to debug as usual. 
It is advisable to add For example, lets look at this C code, which has two problems : ```cpp - #include <stdlib.h> - - void f(void) - { - int* x = malloc(10 * sizeof(int)); - x[10] = 0; // problem 1: heap block overrun - } // problem 2: memory leak -- x not freed - - int main(void) - { - f(); - return 0; - } +#include <stdlib.h> + +void f(void) +{ + int* x = malloc(10 * sizeof(int)); + x[10] = 0; // problem 1: heap block overrun +} // problem 2: memory leak -- x not freed + +int main(void) +{ + f(); + return 0; +} ``` Now, compile it with Intel compiler : @@ -156,24 +162,29 @@ The default version without MPI support will however report a large number of fa ==30166== by 0x4008BD: main (valgrind-example-mpi.c:18) ``` -so it is better to use the MPI-enabled valgrind from module. The MPI version requires library /apps/tools/valgrind/3.9.0/impi/lib/valgrind/libmpiwrap-amd64-linux.so, which must be included in the LD_PRELOAD environment variable. +so it is better to use the MPI-enabled valgrind from module. The MPI version requires library: + +* Anselm: /apps/tools/valgrind/3.9.0/impi/lib/valgrind/libmpiwrap-amd64-linux.so +* Salomon: $EBROOTVALGRIND/lib/valgrind/libmpiwrap-amd64-linux.so + +which must be included in the LD_PRELOAD environment variable. Lets look at this MPI example : ```cpp - #include <stdlib.h> - #include <mpi.h> +#include <stdlib.h> +#include <mpi.h> - int main(int argc, char *argv[]) - { - int *data = malloc(sizeof(int)*99); +int main(int argc, char *argv[]) +{ + int *data = malloc(sizeof(int)*99); - MPI_Init(&argc, &argv); - MPI_Bcast(data, 100, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Finalize(); + MPI_Init(&argc, &argv); + MPI_Bcast(data, 100, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Finalize(); - return 0; - } + return 0; +} ``` There are two errors - use of uninitialized memory and invalid length of the buffer. Lets debug it with valgrind : diff --git a/docs.it4i/salomon/software/debuggers/vampir.md b/docs.it4i/software/debuggers/vampir.md similarity index 90% rename from docs.it4i/salomon/software/debuggers/vampir.md rename to docs.it4i/software/debuggers/vampir.md index 852374d229d2c4f4a2e4c612c85d25b1c121faf0..93d11f0ca571bd8dadd401b7f6173a3403683476 100644 --- a/docs.it4i/salomon/software/debuggers/vampir.md +++ b/docs.it4i/software/debuggers/vampir.md @@ -2,7 +2,7 @@ Vampir is a commercial trace analysis and visualisation tool. It can work with traces in OTF and OTF2 formats. It does not have the functionality to collect traces, you need to use a trace collection tool (such as [Score-P](score-p/)) first to collect the traces. - + ## Installed Versions diff --git a/docs.it4i/software/easybuild.md b/docs.it4i/software/easybuild.md index 25efb91214024c7b03d00fc8873d00745872f85e..df4ddd4cfb049857c0e1a4abca08bdbf661e4ba5 100644 --- a/docs.it4i/software/easybuild.md +++ b/docs.it4i/software/easybuild.md @@ -12,8 +12,8 @@ All builds and installations are performed at user level, so you don't need the EasyBuild relies on two main concepts - * Toolchains - * EasyConfig file (our easyconfigs is [here](https://code.it4i.cz/sccs/easyconfigs-it4i)) +* Toolchains +* EasyConfig file (our easyconfigs is [here](https://code.it4i.cz/sccs/easyconfigs-it4i)) Detailed documentations is available [here](http://easybuild.readthedocs.io). @@ -21,8 +21,8 @@ Detailed documentations is available [here](http://easybuild.readthedocs.io). A toolchain corresponds to a compiler and a set of libraries which are commonly used to build a software. 
The two main toolchains frequently used on the IT4Innovations clusters are the **foss** and **intel**. - * **foss** is based on the GCC compiler and on open-source libraries (OpenMPI, OpenBLAS, etc.). - * **intel** is based on the Intel compiler and on Intel libraries (Intel MPI, Intel Math Kernel Library, etc.). +* **foss** is based on the GCC compiler and on open-source libraries (OpenMPI, OpenBLAS, etc.). +* **intel** is based on the Intel compiler and on Intel libraries (Intel MPI, Intel Math Kernel Library, etc.). Additional details are available on [here](https://github.com/hpcugent/easybuild/wiki/Compiler-toolchains). diff --git a/docs.it4i/anselm/software/gpi2.md b/docs.it4i/software/gpi2.md similarity index 83% rename from docs.it4i/anselm/software/gpi2.md rename to docs.it4i/software/gpi2.md index 09241e15a96f7412f2e7652efda091d7868cd5d1..1de40bd8a592cf0d450a8744f704a767004b2b6a 100644 --- a/docs.it4i/anselm/software/gpi2.md +++ b/docs.it4i/software/gpi2.md @@ -12,6 +12,8 @@ The GPI-2, version 1.0.2 is available on Anselm via module gpi2: ```console $ ml gpi2 + +$ ml av GPI-2 # Salomon ``` The module sets up environment variables, required for linking and running GPI-2 enabled applications. This particular command loads the default module, which is gpi2/1.0.2 @@ -99,40 +101,40 @@ The gaspi_logger utility is used to view the output from all nodes except the ma Following is an example GPI-2 enabled code: ```cpp - #include <GASPI.h> - #include <stdlib.h> +#include <GASPI.h> +#include <stdlib.h> - void success_or_exit ( const char* file, const int line, const int ec) +void success_or_exit ( const char* file, const int line, const int ec) +{ + if (ec != GASPI_SUCCESS) { - if (ec != GASPI_SUCCESS) - { - gaspi_printf ("Assertion failed in %s[%i]:%dn", file, line, ec); - exit (1); - } + gaspi_printf ("Assertion failed in %s[%i]:%dn", file, line, ec); + exit (1); } +} - #define ASSERT(ec) success_or_exit (__FILE__, __LINE__, ec); +#define ASSERT(ec) success_or_exit (__FILE__, __LINE__, ec); - int main(int argc, char *argv[]) - { - gaspi_rank_t rank, num; - gaspi_return_t ret; +int main(int argc, char *argv[]) +{ + gaspi_rank_t rank, num; + gaspi_return_t ret; - /* Initialize GPI-2 */ - ASSERT( gaspi_proc_init(GASPI_BLOCK) ); + /* Initialize GPI-2 */ + ASSERT( gaspi_proc_init(GASPI_BLOCK) ); - /* Get ranks information */ - ASSERT( gaspi_proc_rank(&rank) ); - ASSERT( gaspi_proc_num(&num) ); + /* Get ranks information */ + ASSERT( gaspi_proc_rank(&rank) ); + ASSERT( gaspi_proc_num(&num) ); - gaspi_printf("Hello from rank %d of %dn", - rank, num); + gaspi_printf("Hello from rank %d of %dn", + rank, num); - /* Terminate */ - ASSERT( gaspi_proc_term(GASPI_BLOCK) ); + /* Terminate */ + ASSERT( gaspi_proc_term(GASPI_BLOCK) ); - return 0; - } + return 0; +} ``` Load modules and compile: diff --git a/docs.it4i/salomon/software/intel-suite/intel-advisor.md b/docs.it4i/software/intel-suite/intel-advisor.md similarity index 100% rename from docs.it4i/salomon/software/intel-suite/intel-advisor.md rename to docs.it4i/software/intel-suite/intel-advisor.md diff --git a/docs.it4i/salomon/software/intel-suite/intel-compilers.md b/docs.it4i/software/intel-suite/intel-compilers.md similarity index 100% rename from docs.it4i/salomon/software/intel-suite/intel-compilers.md rename to docs.it4i/software/intel-suite/intel-compilers.md diff --git a/docs.it4i/salomon/software/intel-suite/intel-debugger.md b/docs.it4i/software/intel-suite/intel-debugger.md similarity index 79% rename from 
docs.it4i/salomon/software/intel-suite/intel-debugger.md rename to docs.it4i/software/intel-suite/intel-debugger.md index 15788c798785390777016856b8ffcc111227c1d2..ac7cec6ad56acbc3705fcdc478531e2cade64c47 100644 --- a/docs.it4i/salomon/software/intel-suite/intel-debugger.md +++ b/docs.it4i/software/intel-suite/intel-debugger.md @@ -4,10 +4,10 @@ IDB is no longer available since Intel Parallel Studio 2015 ## Debugging Serial Applications -The intel debugger version 13.0 is available, via module intel. The debugger works for applications compiled with C and C++ compiler and the ifort fortran 77/90/95 compiler. The debugger provides java GUI environment. Use [X display](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/) for running the GUI. +The intel debugger version is available, via module intel/13.5.192. The debugger works for applications compiled with C and C++ compiler and the ifort fortran 77/90/95 compiler. The debugger provides java GUI environment. Use [X display](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/) for running the GUI. ```console -$ ml intel +$ ml intel/13.5.192 $ ml Java $ idb ``` @@ -18,12 +18,12 @@ The debugger may run in text mode. To debug in text mode, use $ idbc ``` -To debug on the compute nodes, module intel must be loaded. The GUI on compute nodes may be accessed using the same way as in [the GUI section](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/) +To debug on the compute nodes, module intel must be loaded. The GUI on compute nodes may be accessed using the same way as in [the GUI section](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/) Example: ```console -$ qsub -q qexp -l select=1:ncpus=24 -X -I +$ qsub -q qexp -l select=1:ncpus=24 -X -I # use 16 threads for Anselm qsub: waiting for job 19654.srv11 to start qsub: job 19654.srv11 ready $ ml intel @@ -40,7 +40,7 @@ In this example, we allocate 1 full compute node, compile program myprog.c with ### Small Number of MPI Ranks -For debugging small number of MPI ranks, you may execute and debug each rank in separate xterm terminal (do not forget the [X display](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/)). Using Intel MPI, this may be done in following way: +For debugging small number of MPI ranks, you may execute and debug each rank in separate xterm terminal (do not forget the [X display](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/)). 
Using Intel MPI, this may be done in following way: ```console $ qsub -q qexp -l select=2:ncpus=24 -X -I diff --git a/docs.it4i/salomon/software/intel-suite/intel-inspector.md b/docs.it4i/software/intel-suite/intel-inspector.md similarity index 100% rename from docs.it4i/salomon/software/intel-suite/intel-inspector.md rename to docs.it4i/software/intel-suite/intel-inspector.md diff --git a/docs.it4i/salomon/software/intel-suite/intel-integrated-performance-primitives.md b/docs.it4i/software/intel-suite/intel-integrated-performance-primitives.md similarity index 53% rename from docs.it4i/salomon/software/intel-suite/intel-integrated-performance-primitives.md rename to docs.it4i/software/intel-suite/intel-integrated-performance-primitives.md index 60628eed0744d4305f79f4b77ff2f4de8e11c10d..a47233367e4130177be4db677197a07ec26f9fb2 100644 --- a/docs.it4i/salomon/software/intel-suite/intel-integrated-performance-primitives.md +++ b/docs.it4i/software/intel-suite/intel-integrated-performance-primitives.md @@ -15,44 +15,44 @@ The module sets up environment variables, required for linking and running ipp e ## IPP Example ```cpp - #include "ipp.h" - #include <stdio.h> - int main(int argc, char* argv[]) - { - const IppLibraryVersion *lib; - Ipp64u fm; - IppStatus status; - - status= ippInit(); //IPP initialization with the best optimization layer - if( status != ippStsNoErr ) { - printf("IppInit() Error:n"); - printf("%sn", ippGetStatusString(status) ); - return -1; - } - - //Get version info - lib = ippiGetLibVersion(); - printf("%s %sn", lib->Name, lib->Version); - - //Get CPU features enabled with selected library level - fm=ippGetEnabledCpuFeatures(); - printf("SSE :%cn",(fm>1)&1?'Y':'N'); - printf("SSE2 :%cn",(fm>2)&1?'Y':'N'); - printf("SSE3 :%cn",(fm>3)&1?'Y':'N'); - printf("SSSE3 :%cn",(fm>4)&1?'Y':'N'); - printf("SSE41 :%cn",(fm>6)&1?'Y':'N'); - printf("SSE42 :%cn",(fm>7)&1?'Y':'N'); - printf("AVX :%cn",(fm>8)&1 ?'Y':'N'); - printf("AVX2 :%cn", (fm>15)&1 ?'Y':'N' ); - printf("----------n"); - printf("OS Enabled AVX :%cn", (fm>9)&1 ?'Y':'N'); - printf("AES :%cn", (fm>10)&1?'Y':'N'); - printf("CLMUL :%cn", (fm>11)&1?'Y':'N'); - printf("RDRAND :%cn", (fm>13)&1?'Y':'N'); - printf("F16C :%cn", (fm>14)&1?'Y':'N'); - - return 0; +#include "ipp.h" +#include <stdio.h> +int main(int argc, char* argv[]) +{ + const IppLibraryVersion *lib; + Ipp64u fm; + IppStatus status; + + status= ippInit(); //IPP initialization with the best optimization layer + if( status != ippStsNoErr ) { + printf("IppInit() Error:n"); + printf("%sn", ippGetStatusString(status) ); + return -1; } + + //Get version info + lib = ippiGetLibVersion(); + printf("%s %sn", lib->Name, lib->Version); + + //Get CPU features enabled with selected library level + fm=ippGetEnabledCpuFeatures(); + printf("SSE :%cn",(fm>1)&1?'Y':'N'); + printf("SSE2 :%cn",(fm>2)&1?'Y':'N'); + printf("SSE3 :%cn",(fm>3)&1?'Y':'N'); + printf("SSSE3 :%cn",(fm>4)&1?'Y':'N'); + printf("SSE41 :%cn",(fm>6)&1?'Y':'N'); + printf("SSE42 :%cn",(fm>7)&1?'Y':'N'); + printf("AVX :%cn",(fm>8)&1 ?'Y':'N'); + printf("AVX2 :%cn", (fm>15)&1 ?'Y':'N' ); + printf("----------n"); + printf("OS Enabled AVX :%cn", (fm>9)&1 ?'Y':'N'); + printf("AES :%cn", (fm>10)&1?'Y':'N'); + printf("CLMUL :%cn", (fm>11)&1?'Y':'N'); + printf("RDRAND :%cn", (fm>13)&1?'Y':'N'); + printf("F16C :%cn", (fm>14)&1?'Y':'N'); + + return 0; +} ``` Compile above example, using any compiler and the ipp module. 
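As an illustration of that step, the example might be compiled with the Intel compiler roughly as follows (the source file name is arbitrary; linking against the IPP image, signal and core libraries is assumed):

```console
$ ml intel ipp
$ icc testipp.c -o testipp.x -lippi -lipps -lippcore   # link the IPP image, signal and core libraries
$ ./testipp.x
```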
diff --git a/docs.it4i/salomon/software/intel-suite/intel-mkl.md b/docs.it4i/software/intel-suite/intel-mkl.md similarity index 98% rename from docs.it4i/salomon/software/intel-suite/intel-mkl.md rename to docs.it4i/software/intel-suite/intel-mkl.md index 6b54e0890202f817dd42c04eabf886489bd695d0..2053e958b2673acb4fc79e4e552bea5cf016d85e 100644 --- a/docs.it4i/salomon/software/intel-suite/intel-mkl.md +++ b/docs.it4i/software/intel-suite/intel-mkl.md @@ -15,9 +15,10 @@ Intel Math Kernel Library (Intel MKL) is a library of math kernel subroutines, e For details see the [Intel MKL Reference Manual](http://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mklman/index.htm). -Intel MKL version 11.2.3.187 is available on the cluster +Intel MKL is available on the cluster ```console +$ ml av imkl $ ml imkl ``` @@ -51,7 +52,7 @@ Advantage in using Intel MKL library is that it brings threaded parallelization For this to work, the application must link the threaded MKL library (default). Number and behaviour of MKL threads may be controlled via the OpenMP environment variables, such as OMP_NUM_THREADS and KMP_AFFINITY. MKL_NUM_THREADS takes precedence over OMP_NUM_THREADS ```console -$ export OMP_NUM_THREADS=24 +$ export OMP_NUM_THREADS=24 # 16 for Anselm $ export KMP_AFFINITY=granularity=fine,compact,1,0 ``` diff --git a/docs.it4i/salomon/software/intel-suite/intel-parallel-studio-introduction.md b/docs.it4i/software/intel-suite/intel-parallel-studio-introduction.md similarity index 92% rename from docs.it4i/salomon/software/intel-suite/intel-parallel-studio-introduction.md rename to docs.it4i/software/intel-suite/intel-parallel-studio-introduction.md index b22274a0e0a4c32942b15ba90244621eba21aa54..7b6ba956b932b63d535dc0e3aeb7667385fdccf8 100644 --- a/docs.it4i/salomon/software/intel-suite/intel-parallel-studio-introduction.md +++ b/docs.it4i/software/intel-suite/intel-parallel-studio-introduction.md @@ -15,7 +15,7 @@ Intel Parallel Studio XE ## Intel Compilers -The Intel compilers version 131.3 are available, via module iccifort/2013.5.192-GCC-4.8.3. The compilers include the icc C and C++ compiler and the ifort fortran 77/90/95 compiler. +The Intel compilers are available, via module intel. The compilers include the icc C and C++ compiler and the ifort fortran 77/90/95 compiler. ```console $ ml intel diff --git a/docs.it4i/salomon/software/intel-suite/intel-tbb.md b/docs.it4i/software/intel-suite/intel-tbb.md similarity index 100% rename from docs.it4i/salomon/software/intel-suite/intel-tbb.md rename to docs.it4i/software/intel-suite/intel-tbb.md diff --git a/docs.it4i/salomon/software/intel-suite/intel-trace-analyzer-and-collector.md b/docs.it4i/software/intel-suite/intel-trace-analyzer-and-collector.md similarity index 90% rename from docs.it4i/salomon/software/intel-suite/intel-trace-analyzer-and-collector.md rename to docs.it4i/software/intel-suite/intel-trace-analyzer-and-collector.md index 9cae361ca43dccb382bd5b09f5c5a9d270e0414c..b7bf6c92d3a03112392a86078037aeff28e8623f 100644 --- a/docs.it4i/salomon/software/intel-suite/intel-trace-analyzer-and-collector.md +++ b/docs.it4i/software/intel-suite/intel-trace-analyzer-and-collector.md @@ -21,7 +21,7 @@ The trace will be saved in file myapp.stf in the current directory. 
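For context, a trace such as myapp.stf is typically produced by running the application through Intel MPI with the trace collector enabled; a hedged sketch (the module version and binary name are illustrative):

```console
$ ml intel itac/9.1.2.024
$ mpirun -trace -n 24 ./myapp    # writes myapp.stf into the current directory
```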
## Viewing Traces -To view and analyze the trace, open ITAC GUI in a [graphical environment](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/): +To view and analyze the trace, open ITAC GUI in a [graphical environment](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/): ```console $ ml itac/9.1.2.024 @@ -30,7 +30,7 @@ $ traceanalyzer The GUI will launch and you can open the produced `*`.stf file. - + Please refer to Intel documenation about usage of the GUI tool. diff --git a/docs.it4i/anselm/software/intel-xeon-phi.md b/docs.it4i/software/intel-xeon-phi.anselm.md similarity index 77% rename from docs.it4i/anselm/software/intel-xeon-phi.md rename to docs.it4i/software/intel-xeon-phi.anselm.md index d879361135e715e4af6862ed6636adb45a895fb1..b1e86256d093b4bd34fe799e48f64d38f48d0e83 100644 --- a/docs.it4i/anselm/software/intel-xeon-phi.md +++ b/docs.it4i/software/intel-xeon-phi.anselm.md @@ -27,65 +27,65 @@ $ /usr/bin/micinfo The output of the "micinfo" utility executed on one of the Anselm node is as follows. (note: to get PCIe related details the command has to be run with root privileges) ```console - MicInfo Utility Log - - Created Mon Jul 22 00:23:50 2013 - - System Info - HOST OS : Linux - OS Version : 2.6.32-279.5.2.bl6.Bull.33.x86_64 - Driver Version : 6720-15 - MPSS Version : 2.1.6720-15 - Host Physical Memory : 98843 MB - - Device No: 0, Device Name: mic0 - - Version - Flash Version : 2.1.03.0386 - SMC Firmware Version : 1.15.4830 - SMC Boot Loader Version : 1.8.4326 - uOS Version : 2.6.38.8-g2593b11 - Device Serial Number : ADKC30102482 - - Board - Vendor ID : 0x8086 - Device ID : 0x2250 - Subsystem ID : 0x2500 - Coprocessor Stepping ID : 3 - PCIe Width : x16 - PCIe Speed : 5 GT/s - PCIe Max payload size : 256 bytes - PCIe Max read req size : 512 bytes - Coprocessor Model : 0x01 - Coprocessor Model Ext : 0x00 - Coprocessor Type : 0x00 - Coprocessor Family : 0x0b - Coprocessor Family Ext : 0x00 - Coprocessor Stepping : B1 - Board SKU : B1PRQ-5110P/5120D - ECC Mode : Enabled - SMC HW Revision : Product 225W Passive CS - - Cores - Total No of Active Cores : 60 - Voltage : 1032000 uV - Frequency : 1052631 kHz - - Thermal - Fan Speed Control : N/A - Fan RPM : N/A - Fan PWM : N/A - Die Temp : 49 C - - GDDR - GDDR Vendor : Elpida - GDDR Version : 0x1 - GDDR Density : 2048 Mb - GDDR Size : 7936 MB - GDDR Technology : GDDR5 - GDDR Speed : 5.000000 GT/s - GDDR Frequency : 2500000 kHz - GDDR Voltage : 1501000 uV +MicInfo Utility Log +Created Wed Sep 13 13:44:14 2017 + + + System Info + HOST OS : Linux + OS Version : 2.6.32-696.3.2.el6.Bull.120.x86_64 + Driver Version : 3.4.9-1 + MPSS Version : 3.4.9 + Host Physical Memory : 98836 MB + +Device No: 0, Device Name: mic0 + + Version + Flash Version : 2.1.02.0391 + SMC Firmware Version : 1.17.6900 + SMC Boot Loader Version : 1.8.4326 + uOS Version : 2.6.38.8+mpss3.4.9 + Device Serial Number : ADKC30102489 + + Board + Vendor ID : 0x8086 + Device ID : 0x2250 + Subsystem ID : 0x2500 + Coprocessor Stepping ID : 3 + PCIe Width : x16 + PCIe Speed : 5 GT/s + PCIe Max payload size : 256 bytes + PCIe Max read req size : 512 bytes + Coprocessor Model : 0x01 + Coprocessor Model Ext : 0x00 + Coprocessor Type : 0x00 + Coprocessor Family : 0x0b + Coprocessor Family Ext : 0x00 + Coprocessor Stepping : B1 + Board SKU : B1PRQ-5110P/5120D + ECC Mode : Enabled + SMC HW Revision : Product 225W Passive CS + + Cores + Total No of Active Cores : 60 + Voltage : 1009000 uV + Frequency : 1052631 kHz + + Thermal 
+ Fan Speed Control : N/A + Fan RPM : N/A + Fan PWM : N/A + Die Temp : 53 C + + GDDR + GDDR Vendor : Elpida + GDDR Version : 0x1 + GDDR Density : 2048 Mb + GDDR Size : 7936 MB + GDDR Technology : GDDR5 + GDDR Speed : 5.000000 GT/s + GDDR Frequency : 2500000 kHz + GDDR Voltage : 1501000 uV ``` ## Offload Mode @@ -108,24 +108,24 @@ A very basic example of code that employs offload programming technique is shown !!! note This code is sequential and utilizes only single core of the accelerator. -```console +```cpp $ vim source-offload.cpp - #include <iostream> +#include <iostream> - int main(int argc, char* argv[]) - { - const int niter = 100000; - double result = 0; +int main(int argc, char* argv[]) +{ + const int niter = 100000; + double result = 0; - #pragma offload target(mic) - for (int i = 0; i < niter; ++i) { - const double t = (i + 0.5) / niter; - result += 4.0 / (t * t + 1.0); - } - result /= niter; - std::cout << "Pi ~ " << result << 'n'; + #pragma offload target(mic) + for (int i = 0; i < niter; ++i) { + const double t = (i + 0.5) / niter; + result += 4.0 / (t * t + 1.0); } + result /= niter; + std::cout << "Pi ~ " << result << 'n'; +} ``` To compile a code using Intel compiler run @@ -144,82 +144,82 @@ $ ./bin-offload One way of paralelization a code for Xeon Phi is using OpenMP directives. The following example shows code for parallel vector addition. -```console +```cpp $ vim ./vect-add - #include <stdio.h> - - typedef int T; - - #define SIZE 1000 - - #pragma offload_attribute(push, target(mic)) - T in1[SIZE]; - T in2[SIZE]; - T res[SIZE]; - #pragma offload_attribute(pop) - - // MIC function to add two vectors - __attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) { - int i = 0; - #pragma omp parallel for - for (i = 0; i < size; i++) - c[i] = a[i] + b[i]; +#include <stdio.h> + +typedef int T; + +#define SIZE 1000 + +#pragma offload_attribute(push, target(mic)) +T in1[SIZE]; +T in2[SIZE]; +T res[SIZE]; +#pragma offload_attribute(pop) + +// MIC function to add two vectors +__attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) { + int i = 0; + #pragma omp parallel for + for (i = 0; i < size; i++) + c[i] = a[i] + b[i]; +} + +// CPU function to add two vectors +void add_cpu (T *a, T *b, T *c, int size) { + int i; + for (i = 0; i < size; i++) + c[i] = a[i] + b[i]; +} + +// CPU function to generate a vector of random numbers +void random_T (T *a, int size) { + int i; + for (i = 0; i < size; i++) + a[i] = rand() % 10000; // random number between 0 and 9999 +} + +// CPU function to compare two vectors +int compare(T *a, T *b, T size ){ + int pass = 0; + int i; + for (i = 0; i < size; i++){ + if (a[i] != b[i]) { + printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]); + pass = 1; } + } + if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); + return pass; +} - // CPU function to add two vectors - void add_cpu (T *a, T *b, T *c, int size) { - int i; - for (i = 0; i < size; i++) - c[i] = a[i] + b[i]; - } +int main() +{ + int i; + random_T(in1, SIZE); + random_T(in2, SIZE); - // CPU function to generate a vector of random numbers - void random_T (T *a, int size) { - int i; - for (i = 0; i < size; i++) - a[i] = rand() % 10000; // random number between 0 and 9999 - } + #pragma offload target(mic) in(in1,in2) inout(res) + { - // CPU function to compare two vectors - int compare(T *a, T *b, T size ){ - int pass = 0; - int i; - for (i = 0; i < size; i++){ - if (a[i] != b[i]) { - printf("Value mismatch at location %d, values %d and %dn",i, 
a[i], b[i]); - pass = 1; - } - } - if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); - return pass; - } + // Parallel loop from main function + #pragma omp parallel for + for (i=0; i<SIZE; i++) + res[i] = in1[i] + in2[i]; - int main() - { - int i; - random_T(in1, SIZE); - random_T(in2, SIZE); + // or parallel loop is called inside the function + add_mic(in1, in2, res, SIZE); - #pragma offload target(mic) in(in1,in2) inout(res) - { + } - // Parallel loop from main function - #pragma omp parallel for - for (i=0; i<SIZE; i++) - res[i] = in1[i] + in2[i]; + //Check the results with CPU implementation + T res_cpu[SIZE]; + add_cpu(in1, in2, res_cpu, SIZE); + compare(res, res_cpu, SIZE); - // or parallel loop is called inside the function - add_mic(in1, in2, res, SIZE); - - } - - //Check the results with CPU implementation - T res_cpu[SIZE]; - add_cpu(in1, in2, res_cpu, SIZE); - compare(res, res_cpu, SIZE); - - } +} ``` During the compilation Intel compiler shows which loops have been vectorized in both host and accelerator. This can be enabled with compiler option "-vec-report2". To compile and execute the code run @@ -271,61 +271,61 @@ $ module load intel Following example show how to automatically offload an SGEMM (single precision - general matrix multiply) function to MIC coprocessor. The code can be copied to a file and compiled without any necessary modification. -```console +```cpp $ vim sgemm-ao-short.c - #include <stdio.h> - #include <stdlib.h> - #include <malloc.h> - #include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <stdint.h> - #include "mkl.h" +#include "mkl.h" - int main(int argc, char **argv) - { - float *A, *B, *C; /* Matrices */ +int main(int argc, char **argv) +{ + float *A, *B, *C; /* Matrices */ - MKL_INT N = 2560; /* Matrix dimensions */ - MKL_INT LD = N; /* Leading dimension */ - int matrix_bytes; /* Matrix size in bytes */ - int matrix_elements; /* Matrix size in elements */ + MKL_INT N = 2560; /* Matrix dimensions */ + MKL_INT LD = N; /* Leading dimension */ + int matrix_bytes; /* Matrix size in bytes */ + int matrix_elements; /* Matrix size in elements */ - float alpha = 1.0, beta = 1.0; /* Scaling factors */ - char transa = 'N', transb = 'N'; /* Transposition options */ + float alpha = 1.0, beta = 1.0; /* Scaling factors */ + char transa = 'N', transb = 'N'; /* Transposition options */ - int i, j; /* Counters */ + int i, j; /* Counters */ - matrix_elements = N * N; - matrix_bytes = sizeof(float) * matrix_elements; + matrix_elements = N * N; + matrix_bytes = sizeof(float) * matrix_elements; - /* Allocate the matrices */ - A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes); + /* Allocate the matrices */ + A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes); - /* Initialize the matrices */ - for (i = 0; i < matrix_elements; i++) { - A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; - } + /* Initialize the matrices */ + for (i = 0; i < matrix_elements; i++) { + A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; + } - printf("Computing SGEMM on the hostn"); - sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); + printf("Computing SGEMM on the hostn"); + sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); - printf("Enabling Automatic Offloadn"); - /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */ - mkl_mic_enable(); + printf("Enabling Automatic Offloadn"); + /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */ + mkl_mic_enable(); - 
int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */ - printf("Automatic Offload enabled: %d MIC devices presentn", ndevices); + int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */ + printf("Automatic Offload enabled: %d MIC devices presentn", ndevices); - printf("Computing SGEMM with automatic workdivisionn"); - sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); + printf("Computing SGEMM with automatic workdivisionn"); + sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); - /* Free the matrix memory */ - free(A); free(B); free(C); + /* Free the matrix memory */ + free(A); free(B); free(C); - printf("Donen"); + printf("Donen"); - return 0; - } + return 0; +} ``` !!! note @@ -573,28 +573,28 @@ $ mpiicc -mmic -o mpi-test-mic mpi-test.c An example of basic MPI version of "hello-world" example in C language, that can be executed on both host and Xeon Phi is (can be directly copy and pasted to a .c file) ```cpp - #include <stdio.h> - #include <mpi.h> +#include <stdio.h> +#include <mpi.h> - int main (argc, argv) - int argc; - char *argv[]; - { - int rank, size; +int main (argc, argv) + int argc; + char *argv[]; +{ + int rank, size; - int len; - char node[MPI_MAX_PROCESSOR_NAME]; + int len; + char node[MPI_MAX_PROCESSOR_NAME]; - MPI_Init (&argc, &argv); /* starts MPI */ - MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */ - MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */ + MPI_Init (&argc, &argv); /* starts MPI */ + MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */ + MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */ - MPI_Get_processor_name(node,&len); + MPI_Get_processor_name(node,&len); - printf( "Hello world from process %d of %d on host %s n", rank, size, node ); - MPI_Finalize(); - return 0; - } + printf( "Hello world from process %d of %d on host %s n", rank, size, node ); + MPI_Finalize(); + return 0; +} ``` ### MPI Programming Models diff --git a/docs.it4i/salomon/software/intel-xeon-phi.md b/docs.it4i/software/intel-xeon-phi.md similarity index 75% rename from docs.it4i/salomon/software/intel-xeon-phi.md rename to docs.it4i/software/intel-xeon-phi.md index 6d161439b7871e097ae095a4103a1f37ab490a0e..f09e343ce7c02c194c8d1406cc374442d0be4249 100644 --- a/docs.it4i/salomon/software/intel-xeon-phi.md +++ b/docs.it4i/software/intel-xeon-phi.md @@ -28,113 +28,113 @@ The output of the "micinfo" utility executed on one of the cluster node is as fo ```console MicInfo Utility Log -Created Mon Aug 17 13:55:59 2015 +Created Wed Sep 13 13:39:28 2017 - System Info - HOST OS : Linux - OS Version : 2.6.32-504.16.2.el6.x86_64 - Driver Version : 3.4.1-1 - MPSS Version : 3.4.1 - Host Physical Memory : 131930 MB + System Info + HOST OS : Linux + OS Version : 2.6.32-696.3.2.el6.x86_64 + Driver Version : 3.8.2-1 + MPSS Version : 3.8.2 + Host Physical Memory : 128838 MB Device No: 0, Device Name: mic0 - Version - Flash Version : 2.1.02.0390 - SMC Firmware Version : 1.16.5078 - SMC Boot Loader Version : 1.8.4326 - uOS Version : 2.6.38.8+mpss3.4.1 - Device Serial Number : ADKC44601414 - - Board - Vendor ID : 0x8086 - Device ID : 0x225c - Subsystem ID : 0x7d95 - Coprocessor Stepping ID : 2 - PCIe Width : x16 - PCIe Speed : 5 GT/s - PCIe Max payload size : 256 bytes - PCIe Max read req size : 512 bytes - Coprocessor Model : 0x01 - Coprocessor Model Ext : 0x00 - Coprocessor Type : 0x00 - Coprocessor Family : 0x0b - Coprocessor Family Ext : 0x00 - Coprocessor 
Stepping : C0 - Board SKU : C0PRQ-7120 P/A/X/D - ECC Mode : Enabled - SMC HW Revision : Product 300W Passive CS - - Cores - Total No of Active Cores : 61 - Voltage : 1007000 uV - Frequency : 1238095 kHz - - Thermal - Fan Speed Control : N/A - Fan RPM : N/A - Fan PWM : N/A - Die Temp : 60 C - - GDDR - GDDR Vendor : Samsung - GDDR Version : 0x6 - GDDR Density : 4096 Mb - GDDR Size : 15872 MB - GDDR Technology : GDDR5 - GDDR Speed : 5.500000 GT/s - GDDR Frequency : 2750000 kHz - GDDR Voltage : 1501000 uV + Version + Flash Version : 2.1.02.0391 + SMC Firmware Version : 1.17.6900 + SMC Boot Loader Version : 1.8.4326 + Coprocessor OS Version : 2.6.38.8+mpss3.8.2 + Device Serial Number : ADKC44601725 + + Board + Vendor ID : 0x8086 + Device ID : 0x225c + Subsystem ID : 0x7d95 + Coprocessor Stepping ID : 2 + PCIe Width : x16 + PCIe Speed : 5 GT/s + PCIe Max payload size : 256 bytes + PCIe Max read req size : 512 bytes + Coprocessor Model : 0x01 + Coprocessor Model Ext : 0x00 + Coprocessor Type : 0x00 + Coprocessor Family : 0x0b + Coprocessor Family Ext : 0x00 + Coprocessor Stepping : C0 + Board SKU : C0PRQ-7120 P/A/X/D + ECC Mode : Enabled + SMC HW Revision : Product 300W Passive CS + + Cores + Total No of Active Cores : 61 + Voltage : 1041000 uV + Frequency : 1238095 kHz + + Thermal + Fan Speed Control : N/A + Fan RPM : N/A + Fan PWM : N/A + Die Temp : 50 C + + GDDR + GDDR Vendor : Samsung + GDDR Version : 0x6 + GDDR Density : 4096 Mb + GDDR Size : 15872 MB + GDDR Technology : GDDR5 + GDDR Speed : 5.500000 GT/s + GDDR Frequency : 2750000 kHz + GDDR Voltage : 1501000 uV Device No: 1, Device Name: mic1 - Version - Flash Version : 2.1.02.0390 - SMC Firmware Version : 1.16.5078 - SMC Boot Loader Version : 1.8.4326 - uOS Version : 2.6.38.8+mpss3.4.1 - Device Serial Number : ADKC44500454 - - Board - Vendor ID : 0x8086 - Device ID : 0x225c - Subsystem ID : 0x7d95 - Coprocessor Stepping ID : 2 - PCIe Width : x16 - PCIe Speed : 5 GT/s - PCIe Max payload size : 256 bytes - PCIe Max read req size : 512 bytes - Coprocessor Model : 0x01 - Coprocessor Model Ext : 0x00 - Coprocessor Type : 0x00 - Coprocessor Family : 0x0b - Coprocessor Family Ext : 0x00 - Coprocessor Stepping : C0 - Board SKU : C0PRQ-7120 P/A/X/D - ECC Mode : Enabled - SMC HW Revision : Product 300W Passive CS - - Cores - Total No of Active Cores : 61 - Voltage : 998000 uV - Frequency : 1238095 kHz - - Thermal - Fan Speed Control : N/A - Fan RPM : N/A - Fan PWM : N/A - Die Temp : 59 C - - GDDR - GDDR Vendor : Samsung - GDDR Version : 0x6 - GDDR Density : 4096 Mb - GDDR Size : 15872 MB - GDDR Technology : GDDR5 - GDDR Speed : 5.500000 GT/s - GDDR Frequency : 2750000 kHz - GDDR Voltage : 1501000 uV + Version + Flash Version : 2.1.02.0391 + SMC Firmware Version : 1.17.6900 + SMC Boot Loader Version : 1.8.4326 + Coprocessor OS Version : 2.6.38.8+mpss3.8.2 + Device Serial Number : ADKC44601893 + + Board + Vendor ID : 0x8086 + Device ID : 0x225c + Subsystem ID : 0x7d95 + Coprocessor Stepping ID : 2 + PCIe Width : x16 + PCIe Speed : 5 GT/s + PCIe Max payload size : 256 bytes + PCIe Max read req size : 512 bytes + Coprocessor Model : 0x01 + Coprocessor Model Ext : 0x00 + Coprocessor Type : 0x00 + Coprocessor Family : 0x0b + Coprocessor Family Ext : 0x00 + Coprocessor Stepping : C0 + Board SKU : C0PRQ-7120 P/A/X/D + ECC Mode : Enabled + SMC HW Revision : Product 300W Passive CS + + Cores + Total No of Active Cores : 61 + Voltage : 1053000 uV + Frequency : 1238095 kHz + + Thermal + Fan Speed Control : N/A + Fan RPM : N/A + Fan PWM : N/A + Die Temp : 48 C 
+ + GDDR + GDDR Vendor : Samsung + GDDR Version : 0x6 + GDDR Density : 4096 Mb + GDDR Size : 15872 MB + GDDR Technology : GDDR5 + GDDR Speed : 5.500000 GT/s + GDDR Frequency : 2750000 kHz + GDDR Voltage : 1501000 uV ``` ## Offload Mode @@ -154,7 +154,7 @@ export OFFLOAD_REPORT=3 A very basic example of code that employs offload programming technique is shown in the next listing. Please note that this code is sequential and utilizes only single core of the accelerator. -```console +```cpp $ cat source-offload.cpp #include <iostream> @@ -190,82 +190,82 @@ $ ./bin-offload One way of paralelization a code for Xeon Phi is using OpenMP directives. The following example shows code for parallel vector addition. -```console +```cpp $ cat ./vect-add - #include <stdio.h> +#include <stdio.h> - typedef int T; +typedef int T; - #define SIZE 1000 +#define SIZE 1000 - #pragma offload_attribute(push, target(mic)) - T in1[SIZE]; - T in2[SIZE]; - T res[SIZE]; - #pragma offload_attribute(pop) +#pragma offload_attribute(push, target(mic)) +T in1[SIZE]; +T in2[SIZE]; +T res[SIZE]; +#pragma offload_attribute(pop) - // MIC function to add two vectors - __attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) { - int i = 0; - #pragma omp parallel for - for (i = 0; i < size; i++) - c[i] = a[i] + b[i]; - } +// MIC function to add two vectors +__attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) { + int i = 0; + #pragma omp parallel for + for (i = 0; i < size; i++) + c[i] = a[i] + b[i]; +} - // CPU function to add two vectors - void add_cpu (T *a, T *b, T *c, int size) { - int i; - for (i = 0; i < size; i++) - c[i] = a[i] + b[i]; - } +// CPU function to add two vectors +void add_cpu (T *a, T *b, T *c, int size) { + int i; + for (i = 0; i < size; i++) + c[i] = a[i] + b[i]; +} - // CPU function to generate a vector of random numbers - void random_T (T *a, int size) { - int i; - for (i = 0; i < size; i++) - a[i] = rand() % 10000; // random number between 0 and 9999 - } +// CPU function to generate a vector of random numbers +void random_T (T *a, int size) { + int i; + for (i = 0; i < size; i++) + a[i] = rand() % 10000; // random number between 0 and 9999 +} - // CPU function to compare two vectors - int compare(T *a, T *b, T size ){ - int pass = 0; - int i; - for (i = 0; i < size; i++){ - if (a[i] != b[i]) { - printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]); - pass = 1; - } - } - if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); - return pass; +// CPU function to compare two vectors +int compare(T *a, T *b, T size ){ + int pass = 0; + int i; + for (i = 0; i < size; i++){ + if (a[i] != b[i]) { + printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]); + pass = 1; } + } + if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); + return pass; +} - int main() - { - int i; - random_T(in1, SIZE); - random_T(in2, SIZE); +int main() +{ + int i; + random_T(in1, SIZE); + random_T(in2, SIZE); - #pragma offload target(mic) in(in1,in2) inout(res) - { + #pragma offload target(mic) in(in1,in2) inout(res) + { - // Parallel loop from main function - #pragma omp parallel for - for (i=0; i<SIZE; i++) - res[i] = in1[i] + in2[i]; + // Parallel loop from main function + #pragma omp parallel for + for (i=0; i<SIZE; i++) + res[i] = in1[i] + in2[i]; - // or parallel loop is called inside the function - add_mic(in1, in2, res, SIZE); + // or parallel loop is called inside the function + add_mic(in1, in2, res, SIZE); - } + } - //Check the results with 
CPU implementation - T res_cpu[SIZE]; - add_cpu(in1, in2, res_cpu, SIZE); - compare(res, res_cpu, SIZE); + //Check the results with CPU implementation + T res_cpu[SIZE]; + add_cpu(in1, in2, res_cpu, SIZE); + compare(res, res_cpu, SIZE); - } +} ``` During the compilation Intel compiler shows which loops have been vectorized in both host and accelerator. This can be enabled with compiler option "-vec-report2". To compile and execute the code run @@ -315,9 +315,9 @@ $ qsub -I -q qprod -l select=1:ncpus=24:accelerator=True:naccelerators=2:acceler $ ml intel ``` -The code can be copied to a file and compiled without any necessary modification. +The code can be copied to a file and compiled without any necessary modification. -```console +```cpp $ vim sgemm-ao-short.c #include <stdio.h> @@ -329,46 +329,46 @@ $ vim sgemm-ao-short.c int main(int argc, char **argv) { - float *A, *B, *C; /* Matrices */ + float *A, *B, *C; /* Matrices */ - MKL_INT N = 2560; /* Matrix dimensions */ - MKL_INT LD = N; /* Leading dimension */ - int matrix_bytes; /* Matrix size in bytes */ - int matrix_elements; /* Matrix size in elements */ + MKL_INT N = 2560; /* Matrix dimensions */ + MKL_INT LD = N; /* Leading dimension */ + int matrix_bytes; /* Matrix size in bytes */ + int matrix_elements; /* Matrix size in elements */ - float alpha = 1.0, beta = 1.0; /* Scaling factors */ - char transa = 'N', transb = 'N'; /* Transposition options */ + float alpha = 1.0, beta = 1.0; /* Scaling factors */ + char transa = 'N', transb = 'N'; /* Transposition options */ - int i, j; /* Counters */ + int i, j; /* Counters */ - matrix_elements = N * N; - matrix_bytes = sizeof(float) * matrix_elements; + matrix_elements = N * N; + matrix_bytes = sizeof(float) * matrix_elements; - /* Allocate the matrices */ - A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes); + /* Allocate the matrices */ + A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes); - /* Initialize the matrices */ - for (i = 0; i < matrix_elements; i++) { - A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; - } + /* Initialize the matrices */ + for (i = 0; i < matrix_elements; i++) { + A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; + } - printf("Computing SGEMM on the host\n"); - sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); + printf("Computing SGEMM on the host\n"); + sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); - printf("Enabling Automatic Offload\n"); - /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */ - mkl_mic_enable(); - - int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */ - printf("Automatic Offload enabled: %d MIC devices present\n", ndevices); + printf("Enabling Automatic Offload\n"); + /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */ + mkl_mic_enable(); - printf("Computing SGEMM with automatic workdivision\n"); - sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); + int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */ + printf("Automatic Offload enabled: %d MIC devices present\n", ndevices); - /* Free the matrix memory */ - free(A); free(B); free(C); + printf("Computing SGEMM with automatic workdivision\n"); + sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); - printf("Done\n"); + /* Free the matrix memory */ + free(A); free(B); free(C); + + printf("Done\n"); return 0; } @@ -392,7 +392,7 @@ $ export OFFLOAD_REPORT=2 The output of a code should look similar to following listing, where 
lines starting with [MKL] are generated by offload reporting: ```console -[user@r31u03n799 ~]$ ./sgemm +[user@r31u03n799 ~]$ ./sgemm Computing SGEMM on the host Enabling Automatic Offload Automatic Offload enabled: 2 MIC devices present @@ -517,10 +517,10 @@ For your information the list of libraries and their location required for execu !!! note /apps/all/icc/2015.3.187-GNU-5.1.0-2.25/composer_xe_2015.3.187/compiler/lib/mic - libiomp5.so - libimf.so - libsvml.so - libirng.so + libiomp5.so + libimf.so + libsvml.so + libirng.so libintlc.so.5 Finally, to run the compiled code use: @@ -566,29 +566,29 @@ $ g++ capsbasic.cpp -lOpenCL -o capsbasic -I/apps/intel/opencl/include/ After executing the complied binary file, following output should be displayed. ```console - ./capsbasic +./capsbasic - Number of available platforms: 1 - Platform names: - [0] Intel(R) OpenCL [Selected] - Number of devices available for each type: - CL_DEVICE_TYPE_CPU: 1 - CL_DEVICE_TYPE_GPU: 0 - CL_DEVICE_TYPE_ACCELERATOR: 1 +Number of available platforms: 1 +Platform names: + [0] Intel(R) OpenCL [Selected] +Number of devices available for each type: + CL_DEVICE_TYPE_CPU: 1 + CL_DEVICE_TYPE_GPU: 0 + CL_DEVICE_TYPE_ACCELERATOR: 1 - ** Detailed information for each device *** +** Detailed information for each device *** - CL_DEVICE_TYPE_CPU[0] - CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz - CL_DEVICE_AVAILABLE: 1 +CL_DEVICE_TYPE_CPU[0] + CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz + CL_DEVICE_AVAILABLE: 1 - ... +... - CL_DEVICE_TYPE_ACCELERATOR[0] - CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card - CL_DEVICE_AVAILABLE: 1 +CL_DEVICE_TYPE_ACCELERATOR[0] + CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card + CL_DEVICE_AVAILABLE: 1 - ... +... ``` !!! note @@ -612,23 +612,23 @@ $ g++ cmdoptions.cpp gemm.cpp ../common/basic.cpp ../common/cmdparser.cpp ../com To see the performance of Intel Xeon Phi performing the DGEMM run the example as follows: ```console - ./gemm -d 1 - Platforms (1): - [0] Intel(R) OpenCL [Selected] - Devices (2): - [0] Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz - [1] Intel(R) Many Integrated Core Acceleration Card [Selected] - Build program options: "-DT=float -DTILE_SIZE_M=1 -DTILE_GROUP_M=16 -DTILE_SIZE_N=128 -DTILE_GROUP_N=1 -DTILE_SIZE_K=8" - Running gemm_nn kernel with matrix size: 3968x3968 - Memory row stride to ensure necessary alignment: 15872 bytes - Size of memory region for one matrix: 62980096 bytes - Using alpha = 0.57599 and beta = 0.872412 - ... - Host time: 0.292953 sec. - Host perf: 426.635 GFLOPS - Host time: 0.293334 sec. - Host perf: 426.081 GFLOPS - ... +./gemm -d 1 +Platforms (1): + [0] Intel(R) OpenCL [Selected] +Devices (2): + [0] Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz + [1] Intel(R) Many Integrated Core Acceleration Card [Selected] +Build program options: "-DT=float -DTILE_SIZE_M=1 -DTILE_GROUP_M=16 -DTILE_SIZE_N=128 -DTILE_GROUP_N=1 -DTILE_SIZE_K=8" +Running gemm_nn kernel with matrix size: 3968x3968 +Memory row stride to ensure necessary alignment: 15872 bytes +Size of memory region for one matrix: 62980096 bytes +Using alpha = 0.57599 and beta = 0.872412 +... +Host time: 0.292953 sec. +Host perf: 426.635 GFLOPS +Host time: 0.293334 sec. +Host perf: 426.081 GFLOPS +... ``` !!! 
hint @@ -648,7 +648,7 @@ $ export I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1u,ofa-v2-scif0,ofa-v2-mcm-1 This ensures, that MPI inside node will use SHMEM communication, between HOST and Phi the IB SCIF will be used and between different nodes or Phi's on diferent nodes a CCL-Direct proxy will be used. !!! note - Other FABRICS like tcp,ofa may be used (even combined with shm) but there's severe loss of performance (by order of magnitude). + Other FABRICS like tcp,ofa may be used (even combined with shm) but there's severe loss of performance (by order of magnitude). Usage of single DAPL PROVIDER (e. g. I_MPI_DAPL_PROVIDER=ofa-v2-mlx4_0-1u) will cause failure of Host<->Phi and/or Phi<->Phi communication. Usage of the I_MPI_DAPL_PROVIDER_LIST on non-accelerated node will cause failure of any MPI communication, since those nodes don't have SCIF device and there's no CCL-Direct proxy runnig. @@ -685,28 +685,28 @@ $ mpiifort -mmic -o mpi-test-mic mpi-test.f90 An example of basic MPI version of "hello-world" example in C language, that can be executed on both host and Xeon Phi is (can be directly copy and pasted to a .c file) ```cpp - #include <stdio.h> - #include <mpi.h> +#include <stdio.h> +#include <mpi.h> - int main (argc, argv) - int argc; - char *argv[]; - { - int rank, size; +int main (argc, argv) + int argc; + char *argv[]; +{ + int rank, size; - int len; - char node[MPI_MAX_PROCESSOR_NAME]; + int len; + char node[MPI_MAX_PROCESSOR_NAME]; - MPI_Init (&argc, &argv); /* starts MPI */ - MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */ - MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */ + MPI_Init (&argc, &argv); /* starts MPI */ + MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */ + MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */ - MPI_Get_processor_name(node,&len); + MPI_Get_processor_name(node,&len); - printf( "Hello world from process %d of %d on host %s n", rank, size, node ); - MPI_Finalize(); - return 0; - } + printf( "Hello world from process %d of %d on host %s n", rank, size, node ); + MPI_Finalize(); + return 0; +} ``` ### MPI Programming Models diff --git a/docs.it4i/anselm/software/isv_licenses.md b/docs.it4i/software/isv_licenses.md similarity index 100% rename from docs.it4i/anselm/software/isv_licenses.md rename to docs.it4i/software/isv_licenses.md diff --git a/docs.it4i/salomon/software/java.md b/docs.it4i/software/java.md similarity index 100% rename from docs.it4i/salomon/software/java.md rename to docs.it4i/software/java.md diff --git a/docs.it4i/anselm/software/machine-learning/introduction.md b/docs.it4i/software/machine-learning/introduction.md similarity index 86% rename from docs.it4i/anselm/software/machine-learning/introduction.md rename to docs.it4i/software/machine-learning/introduction.md index 49a1e4f0586c94ed44c04ae1ea16f7ed1636e1db..9b1db80372f81bc32683e03df033c9c4aee75846 100644 --- a/docs.it4i/anselm/software/machine-learning/introduction.md +++ b/docs.it4i/software/machine-learning/introduction.md @@ -1,6 +1,6 @@ # Machine Learning -This section overviews machine learning frameworks and libraries available on the Anselm cluster. +This section overviews machine learning frameworks and libraries available the clusters. ## TensorFlow @@ -16,4 +16,4 @@ Test module: $ ml Tensorflow ``` -Read more about available versions at the [TensorFlow page](tensorflow). +Read more about available versions at the [TensorFlow page](tensorflow/). 
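+
+After loading one of the modules, a short script along the following lines can be used to verify which TensorFlow build the module provides. This is only a sketch: it relies on the TensorFlow 1.x API used by the modules above, and the script name check_tf.py is illustrative.
+
+```python
+# check_tf.py - report the version and CUDA support of the loaded TensorFlow module
+import tensorflow as tf
+
+print("TensorFlow version:", tf.__version__)
+print("Built with CUDA support:", tf.test.is_built_with_cuda())
+
+# run a trivial graph to confirm the runtime itself works
+with tf.Session() as sess:
+    print(sess.run(tf.constant("TensorFlow is working")))
+```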
diff --git a/docs.it4i/software/machine-learning/tensorflow.md b/docs.it4i/software/machine-learning/tensorflow.md new file mode 100644 index 0000000000000000000000000000000000000000..26344cf2d7a33b7de70fe11c992cf410d6e296e9 --- /dev/null +++ b/docs.it4i/software/machine-learning/tensorflow.md @@ -0,0 +1,55 @@ +# TensorFlow + +TensorFlow is an open-source software library for machine intelligence. +For searching available modules type: + +```console +$ ml av Tensorflow +``` + +## Anselm modules + +Anselm provides beside others these three different TensorFlow modules: + +* Tensorflow/1.1.0 (CPU only, not recommended), module built with: + * GCC/4.9.3 + * Python/3.6.1 +* Tensorflow/1.1.0-CUDA-7.5.18-Python-3.6.1 (GPU enabled), module built with: + * GCC/4.9.3 + * Python/3.6.1 + * CUDA/7.5.18 + * cuDNN/5.1-CUDA-7.5.18 +* Tensorflow/1.1.0-CUDA-8.0.44-Python-3.6.1 (GPU enabled), module built with: + * GCC/4.9.3 + * Python/3.6.1 + * CUDA/8.0.44 + * cuDNN/5.1-CUDA-8.0.44 + +## Salomon modules + +Salomon provides beside others these three different TensorFlow modules: + +* Tensorflow/1.1.0 (not recommended), module built with: + * GCC/4.9.3 + * Python/3.6.1 +* Tensorflow/1.2.0-GCC-7.1.0-2.28 (default, recommended), module built with: + * TensorFlow 1.2 with SIMD support. TensorFlow build taking advantage of the Salomon CPU architecture. + * GCC/7.1.0-2.28 + * Python/3.6.1 + * protobuf/3.2.0-GCC-7.1.0-2.28-Python-3.6.1 +* Tensorflow/1.2.0-intel-2017.05-mkl (TensorFlow 1.2 with MKL support), module built with: + * icc/2017.4.196-GCC-7.1.0-2.28 + * Python/3.6.1 + * protobuf/3.2.0-GCC-7.1.0-2.28-Python-3.6.1 + +## TensorFlow application example + +After loading one of the available TensorFlow modules, you can check the functionality running the following python script. + +```python +import tensorflow as tf + +c = tf.constant('Hello World!') +sess = tf.Session() +print(sess.run(c)) +``` diff --git a/docs.it4i/anselm/software/mpi/Running_OpenMPI.md b/docs.it4i/software/mpi/Running_OpenMPI.md similarity index 97% rename from docs.it4i/anselm/software/mpi/Running_OpenMPI.md rename to docs.it4i/software/mpi/Running_OpenMPI.md index 4974eb5b16625faa930a69cded916948257d00a5..5f2606bc2671012b95e53831eb63aa0c95451500 100644 --- a/docs.it4i/anselm/software/mpi/Running_OpenMPI.md +++ b/docs.it4i/software/mpi/Running_OpenMPI.md @@ -2,14 +2,14 @@ ## OpenMPI Program Execution -The OpenMPI programs may be executed only via the PBS Workload manager, by entering an appropriate queue. On Anselm, the **bullxmpi-1.2.4.1** and **OpenMPI 1.6.5** are OpenMPI based MPI implementations. +The OpenMPI programs may be executed only via the PBS Workload manager, by entering an appropriate queue. On Anselm, the **bullxmpi-1.2.4.1** and **OpenMPI 1.6.5** are OpenMPI based MPI implementations. On Salomon, the **OpenMPI 1.8.6** is OpenMPI based MPI implementation. ### Basic Usage !!! note Use the mpiexec to run the OpenMPI code. -Example: +Example (for Anselm): ```console $ qsub -q qexp -l select=4:ncpus=16 -I @@ -52,7 +52,7 @@ The mpiprocs and ompthreads parameters allow for selection of number of running ### One MPI Process Per Node -Follow this example to run one MPI process per node, 16 threads per process. +Follow this example to run one MPI process per node, 16 threads per process (**on Salomon try 24 threads in following examples**). 
```console $ qsub -q qexp -l select=4:ncpus=16:mpiprocs=1:ompthreads=16 -I diff --git a/docs.it4i/salomon/software/mpi/mpi.md b/docs.it4i/software/mpi/mpi.md similarity index 86% rename from docs.it4i/salomon/software/mpi/mpi.md rename to docs.it4i/software/mpi/mpi.md index 99f8745aca779ad71a3ab5322499aa9e8bc9fd25..b307a96223a47fd3b8ff86681e2e8b0f7a483d60 100644 --- a/docs.it4i/salomon/software/mpi/mpi.md +++ b/docs.it4i/software/mpi/mpi.md @@ -39,7 +39,7 @@ Examples: $ ml gompi/2015b ``` -In this example, we activate the latest OpenMPI with latest GNU compilers (OpenMPI 1.8.6 and GCC 5.1). Please see more information about toolchains in section [Environment and Modules](../../environment-and-modules/) . +In this example, we activate the latest OpenMPI with latest GNU compilers (OpenMPI 1.8.6 and GCC 5.1). Please see more information about toolchains in section [Environment and Modules](../../modules-matrix/) . To use OpenMPI with the intel compiler suite, use @@ -71,31 +71,31 @@ Wrappers mpif90, mpif77 that are provided by Intel MPI are designed for gcc and Example program: ```cpp - // helloworld_mpi.c - #include <stdio.h> +// helloworld_mpi.c +#include <stdio.h> - #include<mpi.h> +#include<mpi.h> - int main(int argc, char **argv) { +int main(int argc, char **argv) { - int len; - int rank, size; - char node[MPI_MAX_PROCESSOR_NAME]; +int len; +int rank, size; +char node[MPI_MAX_PROCESSOR_NAME]; - // Initiate MPI - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD,&rank); - MPI_Comm_size(MPI_COMM_WORLD,&size); +// Initiate MPI +MPI_Init(&argc, &argv); +MPI_Comm_rank(MPI_COMM_WORLD,&rank); +MPI_Comm_size(MPI_COMM_WORLD,&size); - // Get hostame and print - MPI_Get_processor_name(node,&len); - printf("Hello world! from rank %d of %d on host %sn",rank,size,node); +// Get hostame and print +MPI_Get_processor_name(node,&len); +printf("Hello world! from rank %d of %d on host %sn",rank,size,node); - // Finalize and exit - MPI_Finalize(); +// Finalize and exit +MPI_Finalize(); - return 0; - } +return 0; +} ``` Compile the above example with @@ -117,10 +117,11 @@ The MPI program executable must be available within the same path on all nodes. Optimal way to run an MPI program depends on its memory requirements, memory access pattern and communication pattern. -Consider these ways to run an MPI program: -1\. One MPI process per node, 24 threads per process -2\. Two MPI processes per node, 12 threads per process -3\. 24 MPI processes per node, 1 thread per process. +!!! note + Consider these ways to run an MPI program: + 1. One MPI process per node, 24 threads per process + 2. Two MPI processes per node, 12 threads per process + 3. 24 MPI processes per node, 1 thread per process. **One MPI** process per node, using 24 threads, is most useful for memory demanding applications, that make good use of processor cache memory and are not memory bound. This is also a preferred way for communication intensive applications as one process per node enjoys full bandwidth access to the network interface. 
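+
+Whichever layout you choose, it is easy to verify where the ranks actually land before running the real application. The sketch below uses mpi4py (see the mpi4py page in this section); the script name placement_check.py and the printed format are illustrative only.
+
+```python
+# placement_check.py - print the host and thread budget of every MPI rank
+import os
+from mpi4py import MPI
+
+comm = MPI.COMM_WORLD
+host = MPI.Get_processor_name()                       # hostname of the node running this rank
+threads = os.environ.get("OMP_NUM_THREADS", "unset")  # thread count set via the ompthreads parameter
+
+print("rank %d of %d on %s, OMP_NUM_THREADS=%s" % (comm.rank, comm.size, host, threads))
+```
+
+Run it with mpiexec as any other MPI program; with one MPI process per node you should see exactly one line per allocated node.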
diff --git a/docs.it4i/anselm/software/mpi/mpi4py-mpi-for-python.md b/docs.it4i/software/mpi/mpi4py-mpi-for-python.md similarity index 53% rename from docs.it4i/anselm/software/mpi/mpi4py-mpi-for-python.md rename to docs.it4i/software/mpi/mpi4py-mpi-for-python.md index 4d687dc2f61e9ae593a7900b1bf183e07e61634f..a8c964d31d2f653f41c14ca245d6453bd7d3efbe 100644 --- a/docs.it4i/anselm/software/mpi/mpi4py-mpi-for-python.md +++ b/docs.it4i/software/mpi/mpi4py-mpi-for-python.md @@ -6,7 +6,7 @@ MPI for Python provides bindings of the Message Passing Interface (MPI) standard This package is constructed on top of the MPI-1/2 specifications and provides an object oriented interface which closely follows MPI-2 C++ bindings. It supports point-to-point (sends, receives) and collective (broadcasts, scatters, gathers) communications of any picklable Python object, as well as optimized communications of Python object exposing the single-segment buffer interface (NumPy arrays, builtin bytes/string/array objects). -On Anselm MPI4Py is available in standard Python modules. +MPI4Py is available in standard Python modules on the clusters. ## Modules @@ -20,7 +20,7 @@ $ ml av Python/ Python/2.7.11-foss-2016a Python/3.5.2-foss-2016a Python/3.5.1 Python/2.7.9-foss-2015g Python/3.4.3-intel-2015b Python/2.7.9 Python/2.7.11-intel-2015b Python/3.5.2 - + $ ml av OpenMPI/ --------------------------------------- /apps/modules/mpi -------------------------- OpenMPI/1.8.6-GCC-4.4.7-system OpenMPI/1.8.8-GNU-4.9.3-2.25 OpenMPI/1.10.1-GCC-4.9.3-2.25 @@ -28,7 +28,8 @@ OpenMPI/1.8.6-GNU-5.1.0-2.25 OpenMPI/1.8.8-GNU-5.1.0-2.25 OpenMPI/1.10.1-GN OpenMPI/1.8.8-iccifort-2015.3.187-GNU-4.9.3-2.25 OpenMPI/2.0.2-GCC-6.3.0-2.27 ``` -!!! Warning "" +!!! Warning "Flavours" + * modules Python/x.x.x-intel... - intel MPI * modules Python/x.x.x-foss... - OpenMPI * modules Python/x.x.x - without MPI @@ -37,8 +38,8 @@ OpenMPI/1.8.6-GNU-5.1.0-2.25 OpenMPI/1.8.8-GNU-5.1.0-2.25 OpenMPI/1.10.1-GN You need to import MPI to your python program. Include the following line to the python script: -```cpp - from mpi4py import MPI +```python +from mpi4py import MPI ``` The MPI4Py enabled python programs [execute as any other OpenMPI](Running_OpenMPI/) code.The simpliest way is to run @@ -57,52 +58,114 @@ $ mpiexec python hello_world.py ### Hello World! -```cpp - from mpi4py import MPI +```python +from mpi4py import MPI - comm = MPI.COMM_WORLD +comm = MPI.COMM_WORLD - print "Hello! I'm rank %d from %d running in total..." % (comm.rank, comm.size) +print "Hello! I'm rank %d from %d running in total..." % (comm.rank, comm.size) - comm.Barrier() # wait for everybody to synchronize +comm.Barrier() # wait for everybody to synchronize ``` ### Collective Communication With NumPy Arrays -```cpp - from mpi4py import MPI - from __future__ import division - import numpy as np +```python +from mpi4py import MPI +from __future__ import division +import numpy as np - comm = MPI.COMM_WORLD +comm = MPI.COMM_WORLD - print("-"*78) - print(" Running on %d cores" % comm.size) - print("-"*78) +print("-"*78) +print(" Running on %d cores" % comm.size) +print("-"*78) - comm.Barrier() +comm.Barrier() - # Prepare a vector of N=5 elements to be broadcasted... - N = 5 - if comm.rank == 0: - A = np.arange(N, dtype=np.float64) # rank 0 has proper data - else: - A = np.empty(N, dtype=np.float64) # all other just an empty array +# Prepare a vector of N=5 elements to be broadcasted... 
+N = 5 +if comm.rank == 0: + A = np.arange(N, dtype=np.float64) # rank 0 has proper data +else: + A = np.empty(N, dtype=np.float64) # all other just an empty array - # Broadcast A from rank 0 to everybody - comm.Bcast( [A, MPI.DOUBLE] ) +# Broadcast A from rank 0 to everybody +comm.Bcast( [A, MPI.DOUBLE] ) - # Everybody should now have the same... - print "[%02d] %s" % (comm.rank, A) +# Everybody should now have the same... +print "[%02d] %s" % (comm.rank, A) ``` Execute the above code as: ```console -$ qsub -q qexp -l select=4:ncpus=16:mpiprocs=16:ompthreads=1 -I +$ qsub -q qexp -l select=4:ncpus=16:mpiprocs=16:ompthreads=1 -I # Salomon: ncpus=24:mpiprocs=24 $ ml Python $ ml OpenMPI $ mpiexec -bycore -bind-to-core python hello_world.py ``` In this example, we run MPI4Py enabled code on 4 nodes, 16 cores per node (total of 64 processes), each python process is bound to a different core. More examples and documentation can be found on [MPI for Python webpage](https://pypi.python.org/pypi/mpi4py). + +### Adding numbers + +Task: count sum of numbers from 1 to 1 000 000. (There is an easy formula to count the sum of arithmetic sequence, but we are showing the MPI solution with adding numbers one by one). + +```python +#!/usr/bin/python + +import numpy +from mpi4py import MPI +import time + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() + +a = 1 +b = 1000000 + +perrank = b//size +summ = numpy.zeros(1) + +comm.Barrier() +start_time = time.time() + +temp = 0 +for i in range(a + rank*perrank, a + (rank+1)*perrank): + temp = temp + i + +summ[0] = temp + +if rank == 0: + total = numpy.zeros(1) +else: + total = None + +comm.Barrier() +#collect the partial results and add to the total sum +comm.Reduce(summ, total, op=MPI.SUM, root=0) + +stop_time = time.time() + +if rank == 0: + #add the rest numbers to 1 000 000 + for i in range(a + (size)*perrank, b+1): + total[0] = total[0] + i + print ("The sum of numbers from 1 to 1 000 000: ", int(total[0])) + print ("time spent with ", size, " threads in milliseconds") + print ("-----", int((time.time()-start_time)*1000), "-----") +``` + +Execute the code above as: + +```console +$ qsub -I -q qexp -l select=4:ncpus=16,walltime=00:30:00 + +$ ml Python/3.5.2-intel-2017.00 + +$ mpirun -n 2 python myprogram.py +``` + +You can increase n and watch time lowering. diff --git a/docs.it4i/anselm/software/mpi/running-mpich2.md b/docs.it4i/software/mpi/running-mpich2.md similarity index 100% rename from docs.it4i/anselm/software/mpi/running-mpich2.md rename to docs.it4i/software/mpi/running-mpich2.md diff --git a/docs.it4i/anselm/software/numerical-languages/introduction.md b/docs.it4i/software/numerical-languages/introduction.md similarity index 97% rename from docs.it4i/anselm/software/numerical-languages/introduction.md rename to docs.it4i/software/numerical-languages/introduction.md index 8646fe6fed34038028fdab9dbcde98840d204944..92251d6dfa3fe1fecf0c3cf8510adb7607923052 100644 --- a/docs.it4i/anselm/software/numerical-languages/introduction.md +++ b/docs.it4i/software/numerical-languages/introduction.md @@ -11,7 +11,7 @@ This section contains a collection of high-level interpreted languages, primaril MATLAB® is a high-level language and interactive environment for numerical computation, visualization, and programming. 
```console -$ ml MATLAB/2015b-EDU +$ ml MATLAB $ matlab ``` diff --git a/docs.it4i/salomon/software/numerical-languages/matlab.md b/docs.it4i/software/numerical-languages/matlab.md similarity index 72% rename from docs.it4i/salomon/software/numerical-languages/matlab.md rename to docs.it4i/software/numerical-languages/matlab.md index e08bf9099ee9d5175a8579afe2fc9d6d32b1aa8f..e3bccc1a9ae9f976509dea53ad6cf4b1ac11302a 100644 --- a/docs.it4i/salomon/software/numerical-languages/matlab.md +++ b/docs.it4i/software/numerical-languages/matlab.md @@ -16,14 +16,14 @@ $ ml MATLAB By default the EDU variant is marked as default. If you need other version or variant, load the particular version. To obtain the list of available versions use ```console -$ module avail MATLAB +$ ml av MATLAB ``` If you need to use the Matlab GUI to prepare your Matlab programs, you can use Matlab directly on the login nodes. But for all computations use Matlab on the compute nodes via PBS Pro scheduler. -If you require the Matlab GUI, please follow the general information about [running graphical applications](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/). +If you require the Matlab GUI, please follow the general information about [running graphical applications](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/). -Matlab GUI is quite slow using the X forwarding built in the PBS (qsub -X), so using X11 display redirection either via SSH or directly by xauth (please see the "GUI Applications on Compute Nodes over VNC" part [here](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/)) is recommended. +Matlab GUI is quite slow using the X forwarding built in the PBS (qsub -X), so using X11 display redirection either via SSH or directly by xauth (please see the "GUI Applications on Compute Nodes over VNC" part [here](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/)) is recommended. To run Matlab with GUI, use @@ -50,11 +50,11 @@ Delete previously used file mpiLibConf.m, we have observed crashes when using In To use Distributed Computing, you first need to setup a parallel profile. We have provided the profile for you, you can either import it in MATLAB command line: ```console - > parallel.importProfile('/apps/all/MATLAB/2015b-EDU/SalomonPBSPro.settings') +> parallel.importProfile('/apps/all/MATLAB/2015b-EDU/SalomonPBSPro.settings') - ans = +ans = - SalomonPBSPro +SalomonPBSPro ``` Or in the GUI, go to tab HOME -> Parallel -> Manage Cluster Profiles..., click Import and navigate to : @@ -63,9 +63,12 @@ Or in the GUI, go to tab HOME -> Parallel -> Manage Cluster Profiles..., click I With the new mode, MATLAB itself launches the workers via PBS, so you can either use interactive mode or a batch mode on one node, but the actual parallel processing will be done in a separate job started by MATLAB itself. Alternatively, you can use "local" mode to run parallel code on just a single node. +!!! note + The profile is confusingly named Salomon, but you can use it also on Anselm. + ### Parallel Matlab Interactive Session -Following example shows how to start interactive session with support for Matlab GUI. For more information about GUI based applications on Anselm see [this page](../../../general/accessing-the-clusters/graphical-user-interface/x-window-system/). +Following example shows how to start interactive session with support for Matlab GUI. 
For more information about GUI based applications on Anselm see [this page](../../general/accessing-the-clusters/graphical-user-interface/x-window-system/). ```console $ xhost + @@ -79,8 +82,8 @@ The second part of the command shows how to request all necessary licenses. In t Once the access to compute nodes is granted by PBS, user can load following modules and start Matlab: ```console - r1i0n17$ ml MATLAB/2015a-EDU - r1i0n17$ matlab & +$ ml MATLAB/2015a-EDU +$ matlab & ``` ### Parallel Matlab Batch Job in Local Mode @@ -88,26 +91,26 @@ Once the access to compute nodes is granted by PBS, user can load following modu To run matlab in batch mode, write an matlab script, then write a bash jobscript and execute via the qsub command. By default, matlab will execute one matlab worker instance per allocated core. ```bash - #!/bin/bash - #PBS -A PROJECT ID - #PBS -q qprod - #PBS -l select=1:ncpus=24:mpiprocs=24:ompthreads=1 +#!/bin/bash +#PBS -A PROJECT ID +#PBS -q qprod +#PBS -l select=1:ncpus=24:mpiprocs=24:ompthreads=1 # Anselm: ncpus=16:mpiprocs=16 - # change to shared scratch directory - SCR=/scratch/work/user/$USER/$PBS_JOBID - mkdir -p $SCR ; cd $SCR || exit +# change to shared scratch directory +SCR=/scratch/.../$USER/$PBS_JOBID # change path in according to the cluster +mkdir -p $SCR ; cd $SCR || exit - # copy input file to scratch - cp $PBS_O_WORKDIR/matlabcode.m . +# copy input file to scratch +cp $PBS_O_WORKDIR/matlabcode.m . - # load modules - module load MATLAB/2015a-EDU +# load modules +module load MATLAB/2015a-EDU - # execute the calculation - matlab -nodisplay -r matlabcode > output.out +# execute the calculation +matlab -nodisplay -r matlabcode > output.out - # copy output file to home - cp output.out $PBS_O_WORKDIR/. +# copy output file to home +cp output.out $PBS_O_WORKDIR/. ``` This script may be submitted directly to the PBS workload manager via the qsub command. The inputs and matlab script are in matlabcode.m file, outputs in output.out file. Note the missing .m extension in the matlab -r matlabcodefile call, **the .m must not be included**. Note that the **shared /scratch must be used**. Further, it is **important to include quit** statement at the end of the matlabcode.m script. @@ -123,7 +126,7 @@ $ qsub ./jobscript The last part of the configuration is done directly in the user Matlab script before Distributed Computing Toolbox is started. ```console - cluster = parcluster('local') +cluster = parcluster('local') ``` This script creates scheduler object "cluster" of type "local" that starts workers locally. @@ -134,40 +137,40 @@ This script creates scheduler object "cluster" of type "local" that starts worke The last step is to start matlabpool with "cluster" object and correct number of workers. We have 24 cores per node, so we start 24 workers. ```console - parpool(cluster,24); +parpool(cluster,24); # Anselm: parpool(cluster,24) - ... parallel code ... +... parallel code ... - parpool close +parpool close ``` The complete example showing how to use Distributed Computing Toolbox in local mode is shown here. -```console - cluster = parcluster('local'); - cluster +```matlab +cluster = parcluster('local'); +cluster - parpool(cluster,24); +parpool(cluster,24); - n=2000; +n=2000; - W = rand(n,n); - W = distributed(W); - x = (1:n)'; - x = distributed(x); - spmd - [~, name] = system('hostname') +W = rand(n,n); +W = distributed(W); +x = (1:n)'; +x = distributed(x); +spmd +[~, name] = system('hostname') - T = W*x; % Calculation performed on labs, in parallel. 
- % T and W are both codistributed arrays here. - end - T; - whos % T and W are both distributed arrays here. + T = W*x; % Calculation performed on labs, in parallel. + % T and W are both codistributed arrays here. +end +T; +whos % T and W are both distributed arrays here. - parpool close - quit +parpool close +quit ``` You can copy and paste the example in a .m file and execute. Note that the parpool size should correspond to **total number of cores** available on allocated nodes. @@ -178,30 +181,30 @@ This mode uses PBS scheduler to launch the parallel pool. It uses the SalomonPBS This is an example of m-script using PBS mode: -```console - cluster = parcluster('SalomonPBSPro'); - set(cluster, 'SubmitArguments', '-A OPEN-0-0'); - set(cluster, 'ResourceTemplate', '-q qprod -l select=10:ncpus=24'); - set(cluster, 'NumWorkers', 240); +```matlab +cluster = parcluster('SalomonPBSPro'); +set(cluster, 'SubmitArguments', '-A OPEN-0-0'); +set(cluster, 'ResourceTemplate', '-q qprod -l select=10:ncpus=24'); +set(cluster, 'NumWorkers', 240); - pool = parpool(cluster,240); +pool = parpool(cluster,240); - n=2000; +n=2000; - W = rand(n,n); - W = distributed(W); - x = (1:n)'; - x = distributed(x); - spmd - [~, name] = system('hostname') +W = rand(n,n); +W = distributed(W); +x = (1:n)'; +x = distributed(x); +spmd +[~, name] = system('hostname') - T = W*x; % Calculation performed on labs, in parallel. - % T and W are both codistributed arrays here. - end - whos % T and W are both distributed arrays here. + T = W*x; % Calculation performed on labs, in parallel. + % T and W are both codistributed arrays here. +end +whos % T and W are both distributed arrays here. - % shut down parallel pool - delete(pool) +% shut down parallel pool +delete(pool) ``` Note that we first construct a cluster object using the imported profile, then set some important options, namely : SubmitArguments, where you need to specify accounting id, and ResourceTemplate, where you need to specify number of nodes to run the job. @@ -219,38 +222,38 @@ For this method, you need to use SalomonDirect profile, import it using [the sam This is an example of m-script using direct mode: -```console - parallel.importProfile('/apps/all/MATLAB/2015b-EDU/SalomonDirect.settings') - cluster = parcluster('SalomonDirect'); - set(cluster, 'NumWorkers', 48); +```matlab +parallel.importProfile('/apps/all/MATLAB/2015b-EDU/SalomonDirect.settings') +cluster = parcluster('SalomonDirect'); +set(cluster, 'NumWorkers', 48); - pool = parpool(cluster, 48); +pool = parpool(cluster, 48); - n=2000; +n=2000; - W = rand(n,n); - W = distributed(W); - x = (1:n)'; - x = distributed(x); - spmd - [~, name] = system('hostname') +W = rand(n,n); +W = distributed(W); +x = (1:n)'; +x = distributed(x); +spmd +[~, name] = system('hostname') - T = W*x; % Calculation performed on labs, in parallel. - % T and W are both codistributed arrays here. - end - whos % T and W are both distributed arrays here. + T = W*x; % Calculation performed on labs, in parallel. + % T and W are both codistributed arrays here. +end +whos % T and W are both distributed arrays here. - % shut down parallel pool - delete(pool) +% shut down parallel pool +delete(pool) ``` ### Non-Interactive Session and Licenses -If you want to run batch jobs with Matlab, be sure to request appropriate license features with the PBS Pro scheduler, at least the `-l __feature__matlab__MATLAB=1` for EDU variant of Matlab. 
More information about how to check the license features states and how to request them with PBS Pro, please [look here](../../../anselm/software/isv_licenses/). +If you want to run batch jobs with Matlab, be sure to request appropriate license features with the PBS Pro scheduler, at least the `-l __feature__matlab__MATLAB=1` for EDU variant of Matlab. More information about how to check the license features states and how to request them with PBS Pro, please [look here](../isv_licenses/). The licensing feature of PBS is currently disabled. -In case of non-interactive session please read the [following information](../../../anselm/software/isv_licenses/) on how to modify the qsub command to test for available licenses prior getting the resource allocation. +In case of non-interactive session please read the [following information](../isv_licenses/) on how to modify the qsub command to test for available licenses prior getting the resource allocation. ### Matlab Distributed Computing Engines Start Up Time @@ -275,4 +278,4 @@ Since this is a SMP machine, you can completely avoid using Parallel Toolbox and ### Local Cluster Mode -You can also use Parallel Toolbox on UV2000. Use l[ocal cluster mode](matlab/#parallel-matlab-batch-job-in-local-mode), "SalomonPBSPro" profile will not work. +You can also use Parallel Toolbox on UV2000. Use [local cluster mode](matlab/#parallel-matlab-batch-job-in-local-mode), "SalomonPBSPro" profile will not work. diff --git a/docs.it4i/anselm/software/numerical-languages/matlab_1314.md b/docs.it4i/software/numerical-languages/matlab_1314.md similarity index 97% rename from docs.it4i/anselm/software/numerical-languages/matlab_1314.md rename to docs.it4i/software/numerical-languages/matlab_1314.md index 41dca05619875b20806beb1a8dde7c255347bd89..1c2d29d3a3053d9a7bec0c5fc777fb024f0be369 100644 --- a/docs.it4i/anselm/software/numerical-languages/matlab_1314.md +++ b/docs.it4i/software/numerical-languages/matlab_1314.md @@ -46,11 +46,11 @@ Plots, images, etc... will be still available. Recommended parallel mode for running parallel Matlab on Anselm is MPIEXEC mode. In this mode user allocates resources through PBS prior to starting Matlab. Once resources are granted the main Matlab instance is started on the first compute node assigned to job by PBS and workers are started on all remaining nodes. User can use both interactive and non-interactive PBS sessions. This mode guarantees that the data processing is not performed on login nodes, but all processing is on compute nodes. - + For the performance reasons Matlab should use system MPI. On Anselm the supported MPI implementation for Matlab is Intel MPI. To switch to system MPI user has to override default Matlab setting by creating new configuration file in its home directory. The path and file name has to be exactly the same as in the following listing: -```console +```matlab $ vim ~/matlab/mpiLibConf.m function [lib, extras] = mpiLibConf @@ -88,9 +88,9 @@ The second part of the command shows how to request all necessary licenses. In t Once the access to compute nodes is granted by PBS, user can load following modules and start Matlab: ```console - cn79$ ml matlab/R2013a-EDU - cn79$ ml impi/4.1.1.036 - cn79$ matlab & +$ ml matlab/R2013a-EDU +$ ml impi/4.1.1.036 +$ matlab & ``` ### Parallel Matlab Batch Job @@ -133,7 +133,7 @@ $ qsub ./jobscript The last part of the configuration is done directly in the user Matlab script before Distributed Computing Toolbox is started. 
-```console +```matlab sched = findResource('scheduler', 'type', 'mpiexec'); set(sched, 'MpiexecFileName', '/apps/intel/impi/4.1.1/bin/mpirun'); set(sched, 'EnvironmentSetMethod', 'setenv'); @@ -158,7 +158,7 @@ matlabpool close The complete example showing how to use Distributed Computing Toolbox is show here. -```console +```matlab sched = findResource('scheduler', 'type', 'mpiexec'); set(sched, 'MpiexecFileName', '/apps/intel/impi/4.1.1/bin/mpirun') set(sched, 'EnvironmentSetMethod', 'setenv') diff --git a/docs.it4i/anselm/software/numerical-languages/octave.md b/docs.it4i/software/numerical-languages/octave.md similarity index 71% rename from docs.it4i/anselm/software/numerical-languages/octave.md rename to docs.it4i/software/numerical-languages/octave.md index 4fbb52979a38da23ec3a9a3c93e456383f99ab22..ca785e75dca4e83cccbdf25b68800363f33a841b 100644 --- a/docs.it4i/anselm/software/numerical-languages/octave.md +++ b/docs.it4i/software/numerical-languages/octave.md @@ -4,13 +4,11 @@ GNU Octave is a high-level interpreted language, primarily intended for numerical computations. It provides capabilities for the numerical solution of linear and nonlinear problems, and for performing other numerical experiments. It also provides extensive graphics capabilities for data visualization and manipulation. Octave is normally used through its interactive command line interface, but it can also be used to write non-interactive programs. The Octave language is quite similar to Matlab so that most programs are easily portable. Read more on <http://www.gnu.org/software/octave/> -Two versions of octave are available on Anselm, via module +For looking for avaible modules, type: -| Version | module | -| ----------------------------------------------------- | ------------------------- | -| Octave 3.8.2, compiled with GCC and Multithreaded MKL | Octave/3.8.2-gimkl-2.11.5 | -| Octave 4.0.1, compiled with GCC and Multithreaded MKL | Octave/4.0.1-gimkl-2.11.5 | -| Octave 4.0.0, compiled with >GCC and OpenBLAS | Octave/4.0.0-foss-2015g | +```console +$ ml av octave +``` ## Modules and Execution @@ -18,7 +16,7 @@ Two versions of octave are available on Anselm, via module $ ml Octave ``` -The octave on Anselm is linked to highly optimized MKL mathematical library. This provides threaded parallelization to many octave kernels, notably the linear algebra subroutines. Octave runs these heavy calculation kernels without any penalty. By default, octave would parallelize to 16 threads. You may control the threads by setting the OMP_NUM_THREADS environment variable. +The octave on clusters is linked to highly optimized MKL mathematical library. This provides threaded parallelization to many octave kernels, notably the linear algebra subroutines. Octave runs these heavy calculation kernels without any penalty. By default, octave would parallelize to 16 (Anselm) or 24 (Salomon) threads. You may control the threads by setting the OMP_NUM_THREADS environment variable. To run octave interactively, log in with ssh -X parameter for X11 forwarding. Run octave: @@ -26,31 +24,31 @@ To run octave interactively, log in with ssh -X parameter for X11 forwarding. Ru $ octave ``` -To run octave in batch mode, write an octave script, then write a bash jobscript and execute via the qsub command. By default, octave will use 16 threads when running MKL kernels. +To run octave in batch mode, write an octave script, then write a bash jobscript and execute via the qsub command. 
By default, octave will use 16 (Anselm) or 24 (Salomon) threads when running MKL kernels. ```bash - #!/bin/bash +#!/bin/bash - # change to local scratch directory - cd /lscratch/$PBS_JOBID || exit +# change to local scratch directory +cd /lscratch/$PBS_JOBID || exit - # copy input file to scratch - cp $PBS_O_WORKDIR/octcode.m . +# copy input file to scratch +cp $PBS_O_WORKDIR/octcode.m . - # load octave module - module load octave +# load octave module +module load octave - # execute the calculation - octave -q --eval octcode > output.out +# execute the calculation +octave -q --eval octcode > output.out - # copy output file to home - cp output.out $PBS_O_WORKDIR/. +# copy output file to home +cp output.out $PBS_O_WORKDIR/. - #exit - exit +#exit +exit ``` -This script may be submitted directly to the PBS workload manager via the qsub command. The inputs are in octcode.m file, outputs in output.out file. See the single node jobscript example in the [Job execution section](../../job-submission-and-execution/). +This script may be submitted directly to the PBS workload manager via the qsub command. The inputs are in octcode.m file, outputs in output.out file. See the single node jobscript example in the [Job execution section](../../salomon/job-submission-and-execution/). The octave c compiler mkoctfile calls the GNU gcc 4.8.1 for compiling native c code. This is very useful for running native c subroutines in octave environment. @@ -62,7 +60,7 @@ Octave may use MPI for interprocess communication This functionality is currentl ## Xeon Phi Support -Octave may take advantage of the Xeon Phi accelerators. This will only work on the [Intel Xeon Phi](../intel-xeon-phi/) [accelerated nodes](../../compute-nodes/). +Octave may take advantage of the Xeon Phi accelerators. This will only work on the [Intel Xeon Phi](../intel-xeon-phi/) [accelerated nodes](../../salomon/compute-nodes/). ### Automatic Offload Support @@ -70,7 +68,7 @@ Octave can accelerate BLAS type operations (in particular the Matrix Matrix mult Example -```console +```octave $ export OFFLOAD_REPORT=2 $ export MKL_MIC_ENABLE=1 $ ml octave diff --git a/docs.it4i/salomon/software/numerical-languages/opencoarrays.md b/docs.it4i/software/numerical-languages/opencoarrays.md similarity index 79% rename from docs.it4i/salomon/software/numerical-languages/opencoarrays.md rename to docs.it4i/software/numerical-languages/opencoarrays.md index f573d400a1991183d99dbfba2f802fba6fd4dd31..55ca845b581b2ed4f07c4431440be56552438b7d 100644 --- a/docs.it4i/salomon/software/numerical-languages/opencoarrays.md +++ b/docs.it4i/software/numerical-languages/opencoarrays.md @@ -20,11 +20,12 @@ Read more on <http://www.opencoarrays.org/> Indexing of individual images can be shown on the simple *Hello World* program: ```fortran - program hello_world - implicit none - print *, 'Hello world from image ', this_image() , 'of', num_images() - end program hello_world +program hello_world + implicit none + print *, 'Hello world from image ', this_image() , 'of', num_images() +end program hello_world ``` + * num_images() - returns the number of all images * this_image() - returns the image index - numbered from 1 to num_images() @@ -34,10 +35,10 @@ Coarray variables can be declared with the **codimension[*]** attribute or by ad Notice, the ***** character always has to be in the square brackets. 
```fortran - integer, codimension[*] :: scalar - integer :: scalar[*] - real, dimension(64), codimension[*] :: vector - real :: vector(64)[*] +integer, codimension[*] :: scalar +integer :: scalar[*] +real, dimension(64), codimension[*] :: vector +real :: vector(64)[*] ``` ### Images Synchronization @@ -48,24 +49,24 @@ Synchronization can be done across all images or only between selected images. B Example program: ```fortran - program synchronization_test - implicit none - integer :: i ! Local variable - integer :: numbers[*] ! Scalar coarray - - ! Genereate random number on image 1 - if (this_image() == 1) then - numbers = floor(rand(1) * 1000) - ! Distribute information to other images - do i = 2, num_images() - numbers[i] = numbers - end do - end if - - sync all ! Barrier to synchronize all images - - print *, 'The random number is', numbers - end program synchronization_test +program synchronization_test + implicit none + integer :: i ! Local variable + integer :: numbers[*] ! Scalar coarray + + ! Genereate random number on image 1 + if (this_image() == 1) then + numbers = floor(rand(1) * 1000) + ! Distribute information to other images + do i = 2, num_images() + numbers[i] = numbers + end do + end if + + sync all ! Barrier to synchronize all images + + print *, 'The random number is', numbers +end program synchronization_test ``` * sync all - Synchronize all images between each other @@ -73,7 +74,7 @@ Example program: * sync images(*index*) - Synchronize this image to image with *index* !!! note - **number** is the local variable while **number[*index*]** accesses the variable in a specific image. + **number** is the local variable while **number[*index*]** accesses the variable in a specific image. **number[this_image()]** is the same as **number**. ## Compile and Run @@ -94,7 +95,7 @@ $ caf hello_world.f90 -o hello_world.x ``` !!! warning - The input file extension **.f90** or **.F90** are to be interpreted as *Fortran 90*. + The input file extension **.f90** or **.F90** are to be interpreted as *Fortran 90*. If the input file extension is **.f** or **.F** the source code will be interpreted as *Fortran 77*. Another method for compiling is by invoking the *mpif90* compiler wrapper directly: @@ -103,7 +104,6 @@ Another method for compiling is by invoking the *mpif90* compiler wrapper direct $ mpif90 hello_world.f90 -o hello_world.x -fcoarray=lib -lcaf_mpi ``` - ### Run CAF Program A CAF program can be run by invoking the *cafrun* wrapper or directly by the *mpiexec*: @@ -124,4 +124,4 @@ $ mpiexec -np 4 ./synchronization_test.x **-np 4** is number of images to run. The parameters of **cafrun** and **mpiexec** are the same. -For more information about running CAF program please follow [Running OpenMPI](../mpi/Running_OpenMPI.md) +For more information about running CAF program please follow [Running OpenMPI - Salomon](../mpi/Running_OpenMPI.md) diff --git a/docs.it4i/salomon/software/numerical-languages/r.md b/docs.it4i/software/numerical-languages/r.md similarity index 78% rename from docs.it4i/salomon/software/numerical-languages/r.md rename to docs.it4i/software/numerical-languages/r.md index 6df515adad043a581ce3da7855737194b1c250ae..3322a89acbf62cde753cfc57adf36a001d986148 100644 --- a/docs.it4i/salomon/software/numerical-languages/r.md +++ b/docs.it4i/software/numerical-languages/r.md @@ -27,7 +27,7 @@ $ ml R ## Execution -The R on Anselm is linked to highly optimized MKL mathematical library. 
This provides threaded parallelization to many R kernels, notably the linear algebra subroutines. The R runs these heavy calculation kernels without any penalty. By default, the R would parallelize to 24 threads. You may control the threads by setting the OMP_NUM_THREADS environment variable. +The R on cluster is linked to highly optimized MKL mathematical library. This provides threaded parallelization to many R kernels, notably the linear algebra subroutines. The R runs these heavy calculation kernels without any penalty. By default, the R would parallelize to 24 (Salomon) or 16 (Anselm) threads. You may control the threads by setting the OMP_NUM_THREADS environment variable. ### Interactive Execution @@ -40,7 +40,7 @@ $ rstudio ### Batch Execution -To run R in batch mode, write an R script, then write a bash jobscript and execute via the qsub command. By default, R will use 24 threads when running MKL kernels. +To run R in batch mode, write an R script, then write a bash jobscript and execute via the qsub command. By default, R will use 24 (Salomon) or 16 (Anselm) threads when running MKL kernels. Example jobscript: @@ -66,7 +66,7 @@ cp routput.out $PBS_O_WORKDIR/. exit ``` -This script may be submitted directly to the PBS workload manager via the qsub command. The inputs are in rscript.R file, outputs in routput.out file. See the single node jobscript example in the [Job execution section](../../job-submission-and-execution/). +This script may be submitted directly to the PBS workload manager via the qsub command. The inputs are in rscript.R file, outputs in routput.out file. See the single node jobscript example in the [Job execution section - Anselm](../../anselm/job-submission-and-execution/). ## Parallel R @@ -166,47 +166,47 @@ Static Rmpi programs are executed via mpiexec, as any other MPI programs. Number Static Rmpi example: ```r - library(Rmpi) +library(Rmpi) - #integrand function - f <- function(i,h) { - x <- h*(i-0.5) - return (4/(1 + x*x)) - } +#integrand function +f <- function(i,h) { +x <- h*(i-0.5) +return (4/(1 + x*x)) +} - #initialize - invisible(mpi.comm.dup(0,1)) - rank <- mpi.comm.rank() - size <- mpi.comm.size() - n<-0 +#initialize +invisible(mpi.comm.dup(0,1)) +rank <- mpi.comm.rank() +size <- mpi.comm.size() +n<-0 - while (TRUE) - { - #read number of intervals - if (rank==0) { - cat("Enter the number of intervals: (0 quits) ") - fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) - } +while (TRUE) +{ + #read number of intervals + if (rank==0) { + cat("Enter the number of intervals: (0 quits) ") + fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) + } - #broadcat the intervals - n <- mpi.bcast(as.integer(n),type=1) + #broadcat the intervals + n <- mpi.bcast(as.integer(n),type=1) - if(n<=0) break + if(n<=0) break - #run the calculation - n <- max(n,size) - h <- 1.0/n + #run the calculation + n <- max(n,size) + h <- 1.0/n - i <- seq(rank+1,n,size); - mypi <- h*sum(sapply(i,f,h)); + i <- seq(rank+1,n,size); + mypi <- h*sum(sapply(i,f,h)); - pi3 <- mpi.reduce(mypi) + pi3 <- mpi.reduce(mypi) - #print results - if (rank==0) cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) - } + #print results + if (rank==0) cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) +} - mpi.quit() +mpi.quit() ``` The above is the static MPI example for calculating the number Ď€. Note the **library(Rmpi)** and **mpi.comm.dup()** function calls. Execute the example as: @@ -222,61 +222,61 @@ Dynamic Rmpi programs are executed by calling the R directly. 
OpenMPI module mus Dynamic Rmpi example: ```r - #integrand function - f <- function(i,h) { - x <- h*(i-0.5) - return (4/(1 + x*x)) - } +#integrand function +f <- function(i,h) { +x <- h*(i-0.5) +return (4/(1 + x*x)) +} - #the worker function - workerpi <- function() - { - #initialize - rank <- mpi.comm.rank() - size <- mpi.comm.size() - n<-0 +#the worker function +workerpi <- function() +{ +#initialize +rank <- mpi.comm.rank() +size <- mpi.comm.size() +n<-0 - while (TRUE) - { - #read number of intervals - if (rank==0) { - cat("Enter the number of intervals: (0 quits) ") - fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) - } +while (TRUE) +{ + #read number of intervals + if (rank==0) { + cat("Enter the number of intervals: (0 quits) ") + fp<-file("stdin"); n<-scan(fp,nmax=1); close(fp) + } - #broadcat the intervals - n <- mpi.bcast(as.integer(n),type=1) + #broadcat the intervals + n <- mpi.bcast(as.integer(n),type=1) - if(n<=0) break + if(n<=0) break - #run the calculation - n <- max(n,size) - h <- 1.0/n + #run the calculation + n <- max(n,size) + h <- 1.0/n - i <- seq(rank+1,n,size); - mypi <- h*sum(sapply(i,f,h)); + i <- seq(rank+1,n,size); + mypi <- h*sum(sapply(i,f,h)); - pi3 <- mpi.reduce(mypi) + pi3 <- mpi.reduce(mypi) - #print results - if (rank==0) cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) - } - } + #print results + if (rank==0) cat(sprintf("Value of PI %16.14f, diff= %16.14fn",pi3,pi3-pi)) +} +} - #main - library(Rmpi) +#main +library(Rmpi) - cat("Enter the number of slaves: ") - fp<-file("stdin"); ns<-scan(fp,nmax=1); close(fp) +cat("Enter the number of slaves: ") +fp<-file("stdin"); ns<-scan(fp,nmax=1); close(fp) - mpi.spawn.Rslaves(nslaves=ns) - mpi.bcast.Robj2slave(f) - mpi.bcast.Robj2slave(workerpi) +mpi.spawn.Rslaves(nslaves=ns) +mpi.bcast.Robj2slave(f) +mpi.bcast.Robj2slave(workerpi) - mpi.bcast.cmd(workerpi()) - workerpi() +mpi.bcast.cmd(workerpi()) +workerpi() - mpi.quit() +mpi.quit() ``` The above example is the dynamic MPI example for calculating the number Ď€. Both master and slave processes carry out the calculation. Note the mpi.spawn.Rslaves(), mpi.bcast.Robj2slave()** and the mpi.bcast.cmd()** function calls. @@ -369,10 +369,10 @@ Example jobscript for [static Rmpi](r/#static-rmpi) parallel R execution, runnin #!/bin/bash #PBS -q qprod #PBS -N Rjob -#PBS -l select=100:ncpus=24:mpiprocs=24:ompthreads=1 +#PBS -l select=100:ncpus=24:mpiprocs=24:ompthreads=1 # Anselm: ncpus=16:mpiprocs=16 # change to scratch directory -SCRDIR=/scratch/work/user/$USER/myjob +SCRDIR=/scratch/work/user/$USER/myjob # Anselm: SCRDIR=/scratch/$USER/myjob cd $SCRDIR || exit # copy input file to scratch @@ -392,7 +392,7 @@ cp routput.out $PBS_O_WORKDIR/. exit ``` -For more information about jobscripts and MPI execution refer to the [Job submission](../../job-submission-and-execution/) and general [MPI](../mpi/mpi/) sections. +For more information about jobscripts and MPI execution refer to the [Job submission](../../anselm/job-submission-and-execution/) and general [MPI](../mpi/mpi/) sections. 
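+A submission sketch (assuming the jobscript above is saved as `rjob.sh`, a hypothetical name, and that `OPEN-0-0` stands in for your own project ID):
+
+```console
+$ qsub -A OPEN-0-0 ./rjob.sh    # script name and project ID are placeholders
+```
+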
## Xeon Phi Offload diff --git a/docs.it4i/anselm/software/numerical-libraries/fftw.md b/docs.it4i/software/numerical-libraries/fftw.md similarity index 100% rename from docs.it4i/anselm/software/numerical-libraries/fftw.md rename to docs.it4i/software/numerical-libraries/fftw.md diff --git a/docs.it4i/anselm/software/numerical-libraries/gsl.md b/docs.it4i/software/numerical-libraries/gsl.md similarity index 100% rename from docs.it4i/anselm/software/numerical-libraries/gsl.md rename to docs.it4i/software/numerical-libraries/gsl.md diff --git a/docs.it4i/anselm/software/numerical-libraries/hdf5.md b/docs.it4i/software/numerical-libraries/hdf5.md similarity index 100% rename from docs.it4i/anselm/software/numerical-libraries/hdf5.md rename to docs.it4i/software/numerical-libraries/hdf5.md diff --git a/docs.it4i/anselm/software/numerical-libraries/intel-numerical-libraries.md b/docs.it4i/software/numerical-libraries/intel-numerical-libraries.md similarity index 100% rename from docs.it4i/anselm/software/numerical-libraries/intel-numerical-libraries.md rename to docs.it4i/software/numerical-libraries/intel-numerical-libraries.md diff --git a/docs.it4i/anselm/software/numerical-libraries/magma-for-intel-xeon-phi.md b/docs.it4i/software/numerical-libraries/magma-for-intel-xeon-phi.md similarity index 100% rename from docs.it4i/anselm/software/numerical-libraries/magma-for-intel-xeon-phi.md rename to docs.it4i/software/numerical-libraries/magma-for-intel-xeon-phi.md diff --git a/docs.it4i/anselm/software/numerical-libraries/petsc.md b/docs.it4i/software/numerical-libraries/petsc.md similarity index 100% rename from docs.it4i/anselm/software/numerical-libraries/petsc.md rename to docs.it4i/software/numerical-libraries/petsc.md diff --git a/docs.it4i/anselm/software/numerical-libraries/trilinos.md b/docs.it4i/software/numerical-libraries/trilinos.md similarity index 100% rename from docs.it4i/anselm/software/numerical-libraries/trilinos.md rename to docs.it4i/software/numerical-libraries/trilinos.md diff --git a/docs.it4i/anselm/software/omics-master/diagnostic-component-team.md b/docs.it4i/software/omics-master/diagnostic-component-team.md similarity index 97% rename from docs.it4i/anselm/software/omics-master/diagnostic-component-team.md rename to docs.it4i/software/omics-master/diagnostic-component-team.md index d8d0c4fc4e26a25550cb96b6dbe16a7a587fecf5..24dc717781a881901310c127739d1e873d151a6b 100644 --- a/docs.it4i/anselm/software/omics-master/diagnostic-component-team.md +++ b/docs.it4i/software/omics-master/diagnostic-component-team.md @@ -13,6 +13,6 @@ VCF files are scanned by this diagnostic tool for known diagnostic disease-assoc TEAM (27) is an intuitive and easy-to-use web tool that fills the gap between the predicted mutations and the final diagnostic in targeted enrichment sequencing analysis. The tool searches for known diagnostic mutations, corresponding to a disease panel, among the predicted patient’s variants. Diagnostic variants for the disease are taken from four databases of disease-related variants (HGMD-public, HUMSAVAR , ClinVar and COSMIC) If no primary diagnostic variant is found, then a list of secondary findings that can help to establish a diagnostic is produced. TEAM also provides with an interface for the definition of and customization of panels, by means of which, genes and mutations can be added or discarded to adjust panel definitions. - + ** Figure 5. **Interface of the application. 
Panels for defining targeted regions of interest can be set up by just drag and drop known disease genes or disease definitions from the lists. Thus, virtual panels can be interactively improved as the knowledge of the disease increases. diff --git a/docs.it4i/anselm/software/omics-master/overview.md b/docs.it4i/software/omics-master/overview.md similarity index 98% rename from docs.it4i/anselm/software/omics-master/overview.md rename to docs.it4i/software/omics-master/overview.md index d09a0030cf06246720287c6d0ffad4bfd11825a6..e29f1daec829dd7af8a93409314a2caef755625d 100644 --- a/docs.it4i/anselm/software/omics-master/overview.md +++ b/docs.it4i/software/omics-master/overview.md @@ -9,7 +9,7 @@ The scope of this OMICS MASTER solution is restricted to human genomics research The pipeline inputs the raw data produced by the sequencing machines and undergoes a processing procedure that consists on a quality control, the mapping and variant calling steps that result in a file containing the set of variants in the sample. From this point, the prioritization component or the diagnostic component can be launched.  +them, depending of the experimental design carried out.](../../img/fig1.png) Figure 1. OMICS MASTER solution overview. Data is produced in the external labs and comes to IT4I (represented by the blue dashed line). The data pre-processor converts raw data into a list of variants and annotations for each sequenced patient. These lists files together with primary and secondary (alignment) data files are stored in IT4I sequence DB and uploaded to the discovery (candidate prioritization) or diagnostic component where they can be analyzed directly by the user that produced them, depending of the experimental design carried out. @@ -41,7 +41,7 @@ Output: FASTQ file plus an HTML file containing statistics on the data. FASTQ format It represents the nucleotide sequence and its corresponding quality scores. - + Figure 2.FASTQ file. #### Mapping @@ -81,7 +81,7 @@ corresponding information is unavailable. The standard CIGAR description of pairwise alignment defines three operations: â€M’ for match/mismatch, â€I’ for insertion compared with the reference and â€D’ for deletion. The extended CIGAR proposed in SAM added four more operations: â€N’ for skipped bases on the reference, â€S’ for soft clipping, â€H’ for hard clipping and â€P’ for padding. These support splicing, clipping, multi-part and padded alignments. Figure 3 shows examples of CIGAR strings for different types of alignments. - + Figure 3 . SAM format file. The â€@SQ’ line in the header section gives the order of reference sequences. Notably, r001 is the name of a read pair. According to FLAG 163 (=1+2+32+128), the read mapped to position 7 is the second read in the pair (128) and regarded as properly paired (1 + 2); its mate is mapped to 37 on the reverse strand (32). Read r002 has three soft-clipped (unaligned) bases. The coordinate shown in SAM is the position of the first aligned base. The CIGAR string for this alignment contains a P (padding) operation which correctly aligns the inserted sequences. Padding operations can be absent when an aligner does not support multiple sequence alignment. The last six bases of read r003 map to position 9, and the first five to position 29 on the reverse strand. The hard clipping operation H indicates that the clipped sequence is not present in the sequence field. The NM tag gives the number of mismatches. Read r004 is aligned across an intron, indicated by the N operation. 
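+Where it helps to see such records first hand, the alignment files produced by the mapping step can be inspected in SAM text form. A minimal sketch, assuming SAMtools is available as a module (module and file names are illustrative only):
+
+```console
+$ ml SAMtools                                 # illustrative module name
+$ samtools view -h sample.bam | head -n 20    # header plus the first records, incl. FLAG and CIGAR columns
+```
+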
@@ -125,7 +125,7 @@ A VCF file consists of a header section and a data section. The header contains  +two bases by another base (SAMPLE2); the second line shows a SNP and an insertion; the third a SNP; the fourth a large structural variant described by the annotation in the INFO column, the coordinate is that of the base before the variant. (b–f ) Alignments and VCF representations of different sequence variants: SNP, insertion, deletion, replacement, and a large deletion. The REF columns shows the reference bases replaced by the haplotype in the ALT column. The coordinate refers to the first reference base. (g) Users are advised to use simplest representation possible and lowest coordinate in cases where the position is ambiguous.](../../img/fig4.png) Figure 4 . (a) Example of valid VCF. The header lines ##fileformat and #CHROM are mandatory, the rest is optional but strongly recommended. Each line of the body describes variants present in the sampled population at one genomic position or region. All alternate alleles are listed in the ALT column and referenced from the genotype fields as 1-based indexes to this list; the reference haplotype is designated as 0. For multiploid data, the separator indicates whether the data are phased (|) or unphased (/). Thus, the two alleles C and G at the positions 2 and 5 in this figure occur on the same chromosome in SAMPLE1. The first data line shows an example of a deletion (present in SAMPLE1) and a replacement of two bases by another base (SAMPLE2); the second line shows a SNP and an insertion; the third a SNP; the fourth a large structural variant described by the annotation in the INFO column, the coordinate is that of the base before the variant. (b–f ) Alignments and VCF representations of different sequence variants: SNP, insertion, deletion, replacement, and a large deletion. The REF columns shows the reference bases replaced by the haplotype in the ALT column. The coordinate refers to the first reference base. (g) Users are advised to use simplest representation possible and lowest coordinate in cases where the position is ambiguous. @@ -231,7 +231,7 @@ second one. --project>. Project ID of your supercomputer allocation. - --queue. [Queue](../../resources-allocation-policy/) to run the jobs in. + --queue. [Queue](../../salomon/resources-allocation-policy/) to run the jobs in. ``` Input, output and ped arguments are mandatory. If the output folder does not exist, the pipeline will create it. @@ -264,7 +264,7 @@ The ped file ( file.ped) contains the following info: FAM sample_B 0 0 2 2 ``` -Now, lets load the NGSPipeline module and copy the sample data to a [scratch directory](../../storage/storage/): +Now, lets load the NGSPipeline module and copy the sample data to a [scratch directory](../../salomon/storage/): ```console $ ml ngsPipeline @@ -278,7 +278,7 @@ Now, we can launch the pipeline (replace OPEN-0-0 with your Project ID): $ ngsPipeline -i /scratch/$USER/omics/sample_data/data -o /scratch/$USER/omics/results -p /scratch/$USER/omics/sample_data/data/file.ped --project OPEN-0-0 --queue qprod ``` -This command submits the processing [jobs to the queue](../../job-submission-and-execution/). +This command submits the processing [jobs to the queue](../../salomon/job-submission-and-execution/). 
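+The progress of the submitted jobs can be followed with the standard PBS commands, for example:
+
+```console
+$ qstat -u $USER    # list your queued and running pipeline jobs
+```
+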
If we want to re-launch the pipeline from stage 4 until stage 20 we should use the next command: @@ -336,25 +336,25 @@ This listing show which tools are used in each step of the pipeline The output folder contains all the subfolders with the intermediate data. This folder contains the final VCF with all the variants. This file can be uploaded into [TEAM](diagnostic-component-team/) by using the VCF file button. It is important to note here that the entire management of the VCF file is local: no patient’s sequence data is sent over the Internet thus avoiding any problem of data privacy or confidentiality. -![TEAM upload panel. Once the file has been uploaded, a panel must be chosen from the Panel list. Then, pressing the Run button the diagnostic process starts.]\((../../../img/fig7.png) +![TEAM upload panel. Once the file has been uploaded, a panel must be chosen from the Panel list. Then, pressing the Run button the diagnostic process starts.]\((../../img/fig7.png) Figure 7. _TEAM upload panel._ _Once the file has been uploaded, a panel must be chosen from the Panel_ list. Then, pressing the Run button the diagnostic process starts. Once the file has been uploaded, a panel must be chosen from the Panel list. Then, pressing the Run button the diagnostic process starts. TEAM searches first for known diagnostic mutation(s) taken from four databases: HGMD-public (20), [HUMSAVAR](http://www.uniprot.org/docs/humsavar), ClinVar (29) and COSMIC (23). - + Figure 7. The panel manager. The elements used to define a panel are ( A ) disease terms, ( B ) diagnostic mutations and ( C ) genes. Arrows represent actions that can be taken in the panel manager. Panels can be defined by using the known mutations and genes of a particular disease. This can be done by dragging them to the Primary Diagnostic box (action D ). This action, in addition to defining the diseases in the Primary Diagnostic box, automatically adds the corresponding genes to the Genes box. The panels can be customized by adding new genes (action F ) or removing undesired genes (action G). New disease mutations can be added independently or associated to an already existing disease term (action E ). Disease terms can be removed by simply dragging them back (action H ). For variant discovering/filtering we should upload the VCF file into BierApp by using the following form: -\\ +\\ Figure 8 . \BierApp VCF upload panel. It is recommended to choose a name for the job as well as a description \\. Each prioritization (â€job’) has three associated screens that facilitate the filtering steps. The first one, the â€Summary’ tab, displays a statistic of the data set analyzed, containing the samples analyzed, the number and types of variants found and its distribution according to consequence types. The second screen, in the â€Variants and effect’ tab, is the actual filtering tool, and the third one, the â€Genome view’ tab, offers a representation of the selected variants within the genomic context provided by an embedded version of the Genome Maps Tool (30). - + Figure 9 . This picture shows all the information associated to the variants. If a variant has an associated phenotype we could see it in the last column. In this case, the variant 7:132481242 CT is associated to the phenotype: large intestine tumor. 
diff --git a/docs.it4i/anselm/software/omics-master/priorization-component-bierapp.md b/docs.it4i/software/omics-master/priorization-component-bierapp.md similarity index 97% rename from docs.it4i/anselm/software/omics-master/priorization-component-bierapp.md rename to docs.it4i/software/omics-master/priorization-component-bierapp.md index 6f88fecc7b92de4bd29a6e022902cb06dbbf1300..07c763fb6db2a6f31c760993b1a094a0e97ee7ff 100644 --- a/docs.it4i/anselm/software/omics-master/priorization-component-bierapp.md +++ b/docs.it4i/software/omics-master/priorization-component-bierapp.md @@ -13,7 +13,7 @@ BiERapp is available at the [following address](http://omics.it4i.cz/bierapp/) BiERapp (28) efficiently helps in the identification of causative variants in family and sporadic genetic diseases. The program reads lists of predicted variants (nucleotide substitutions and indels) in affected individuals or tumor samples and controls. In family studies, different modes of inheritance can easily be defined to filter out variants that do not segregate with the disease along the family. Moreover, BiERapp integrates additional information such as allelic frequencies in the general population and the most popular damaging scores to further narrow down the number of putative variants in successive filtering steps. BiERapp provides an interactive and user-friendly interface that implements the filtering strategy used in the context of a large-scale genomic project carried out by the Spanish Network for Research, in Rare Diseases (CIBERER) and the Medical Genome Project. in which more than 800 exomes have been analyzed. - + ** Figure 6 **. Web interface to the prioritization tool. This figure shows the interface of the web tool for candidate gene prioritization with the filters available. The tool includes a genomic viewer (Genome Maps 30) that enables the representation of the variants in the corresponding genomic coordinates. diff --git a/docs.it4i/anselm/software/openfoam.md b/docs.it4i/software/openfoam.md similarity index 76% rename from docs.it4i/anselm/software/openfoam.md rename to docs.it4i/software/openfoam.md index 865f054d326d17591cf623d0ed9d492d342e01ed..27aefea264ca2414f8abde9cb734896ac1255faa 100644 --- a/docs.it4i/anselm/software/openfoam.md +++ b/docs.it4i/software/openfoam.md @@ -45,7 +45,7 @@ In /opt/modules/modulefiles/engineering you can see installed engineering softwa lsdyna/7.x.x openfoam/2.2.1-gcc481-openmpi1.6.5-SP ``` -For information how to use modules please [look here](../environment-and-modules/). +For information how to use modules please [look here](../anselm/environment-and-modules/). ## Getting Started @@ -92,26 +92,26 @@ Now you can run the first case for example incompressible laminar flow in a cavi Create a Bash script test.sh ```bash - #!/bin/bash - module load openfoam/2.2.1-icc-openmpi1.6.5-DP - source $FOAM_BASHRC +#!/bin/bash +module load openfoam/2.2.1-icc-openmpi1.6.5-DP +source $FOAM_BASHRC - # source to run functions - . $WM_PROJECT_DIR/bin/tools/RunFunctions +# source to run functions +. 
$WM_PROJECT_DIR/bin/tools/RunFunctions - cd $FOAM_RUN/tutorials/incompressible/icoFoam/cavity +cd $FOAM_RUN/tutorials/incompressible/icoFoam/cavity - runApplication blockMesh - runApplication icoFoam +runApplication blockMesh +runApplication icoFoam ``` -Job submission +Job submission (example for Anselm): ```console $ qsub -A OPEN-0-0 -q qprod -l select=1:ncpus=16,walltime=03:00:00 test.sh ``` -For information about job submission please [look here](../job-submission-and-execution/). +For information about job submission please [look here](../anselm/job-submission-and-execution/). ## Running Applications in Parallel @@ -123,17 +123,17 @@ First we must run serial application bockMesh and decomposePar for preparation o Create a Bash scrip test.sh: ```bash - #!/bin/bash - module load openfoam/2.2.1-icc-openmpi1.6.5-DP - source $FOAM_BASHRC +#!/bin/bash +module load openfoam/2.2.1-icc-openmpi1.6.5-DP +source $FOAM_BASHRC - # source to run functions - . $WM_PROJECT_DIR/bin/tools/RunFunctions +# source to run functions +. $WM_PROJECT_DIR/bin/tools/RunFunctions - cd $FOAM_RUN/tutorials/incompressible/simpleFoam/motorBike +cd $FOAM_RUN/tutorials/incompressible/simpleFoam/motorBike - runApplication blockMesh - runApplication decomposePar +runApplication blockMesh +runApplication decomposePar ``` Job submission @@ -148,25 +148,25 @@ This job create simple block mesh and domain decomposition. Check your decomposi Create a PBS script testParallel.pbs: ```bash - #!/bin/bash - #PBS -N motorBike - #PBS -l select=2:ncpus=16 - #PBS -l walltime=01:00:00 - #PBS -q qprod - #PBS -A OPEN-0-0 +#!/bin/bash +#PBS -N motorBike +#PBS -l select=2:ncpus=16 +#PBS -l walltime=01:00:00 +#PBS -q qprod +#PBS -A OPEN-0-0 - module load openfoam/2.2.1-icc-openmpi1.6.5-DP - source $FOAM_BASHRC +module load openfoam/2.2.1-icc-openmpi1.6.5-DP +source $FOAM_BASHRC - cd $FOAM_RUN/tutorials/incompressible/simpleFoam/motorBike +cd $FOAM_RUN/tutorials/incompressible/simpleFoam/motorBike - nproc = 32 +nproc = 32 - mpirun -hostfile ${PBS_NODEFILE} -np $nproc snappyHexMesh -overwrite -parallel | tee snappyHexMesh.log +mpirun -hostfile ${PBS_NODEFILE} -np $nproc snappyHexMesh -overwrite -parallel | tee snappyHexMesh.log - mpirun -hostfile ${PBS_NODEFILE} -np $nproc potentialFoam -noFunctionObject-writep -parallel | tee potentialFoam.log +mpirun -hostfile ${PBS_NODEFILE} -np $nproc potentialFoam -noFunctionObject-writep -parallel | tee potentialFoam.log - mpirun -hostfile ${PBS_NODEFILE} -np $nproc simpleFoam -parallel | tee simpleFoam.log +mpirun -hostfile ${PBS_NODEFILE} -np $nproc simpleFoam -parallel | tee simpleFoam.log ``` nproc – number of subdomains diff --git a/docs.it4i/salomon/software/operating-system.md b/docs.it4i/software/operating-system.md similarity index 60% rename from docs.it4i/salomon/software/operating-system.md rename to docs.it4i/software/operating-system.md index f68a9a97aac216dd727e0973d3ac56754726b90a..5f022fdff9bc624befd9f0082ae38f675c52e5ff 100644 --- a/docs.it4i/salomon/software/operating-system.md +++ b/docs.it4i/software/operating-system.md @@ -1,4 +1,8 @@ -# Operating System +# Anselm + +The operating system on Anselm is Linux - [**Red Hat Enterprise Linux release 6.x**](https://en.wikipedia.org/wiki/Red_Hat_Enterprise_Linux). 
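+The installed release can be verified directly on a login node, for example:
+
+```console
+$ cat /etc/redhat-release    # prints the distribution and release of the node you are logged into
+```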
+ +# Salomon The operating system on Salomon is Linux - [**CentOS 6.x**](https://en.wikipedia.org/wiki/CentOS) diff --git a/docs.it4i/salomon/software/paraview.md b/docs.it4i/software/paraview.md similarity index 69% rename from docs.it4i/salomon/software/paraview.md rename to docs.it4i/software/paraview.md index ce52d69cfa423588bc43361249a19261197e34e5..7e2bae9a95bc33c6f83756188a5c1c54e4037892 100644 --- a/docs.it4i/salomon/software/paraview.md +++ b/docs.it4i/software/paraview.md @@ -12,14 +12,14 @@ Homepage : <http://www.paraview.org/> ## Installed Version -Currently, version 5.1.2 compiled with intel/2017a against intel MPI library and OSMesa 12.0.2 is installed on Salomon. +Currently, version 5.1.2 compiled with intel/2017a against intel MPI library and OSMesa 12.0.2 is installed on the clusters. ## Usage -On Salomon, ParaView is to be used in client-server mode. A parallel ParaView server is launched on compute nodes by the user, and client is launched on your desktop PC to control and view the visualization. Download ParaView client application for your OS here: <http://paraview.org/paraview/resources/software.php>. +On the clusters, ParaView is to be used in client-server mode. A parallel ParaView server is launched on compute nodes by the user, and client is launched on your desktop PC to control and view the visualization. Download ParaView client application for your OS here: <http://paraview.org/paraview/resources/software.php>. !!!Warning - Your version must match the version number installed on Salomon. + Your version must match the version number installed on the cluster. ### Launching Server @@ -29,21 +29,27 @@ To launch the server, you must first allocate compute nodes, for example $ qsub -I -q qprod -A OPEN-0-0 -l select=2 ``` -to launch an interactive session on 2 nodes. Refer to [Resource Allocation and Job Execution](../job-submission-and-execution/) for details. +to launch an interactive session on 2 nodes. Refer to [Resource Allocation and Job Execution](../salomon/job-submission-and-execution/) for details. -After the interactive session is opened, load the ParaView module : +After the interactive session is opened, load the ParaView module (following examples for Salomon, Anselm instructions in comments): ```console $ ml ParaView/5.1.2-intel-2017a-mpi ``` -Now launch the parallel server, with number of nodes times 24 processes: +Now launch the parallel server, with number of nodes times 24 (16 on Anselm) processes: ```console $ mpirun -np 48 pvserver --use-offscreen-rendering Waiting for client... Connection URL: cs://r37u29n1006:11111 - Accepting connection(s): r37u29n1006:11111 + Accepting connection(s): r37u29n1006:11111i + +Anselm: +$ mpirun -np 32 pvserver --use-offscreen-rendering + Waiting for client... + Connection URL: cs://cn77:11111 + Accepting connection(s): cn77:11111 ``` Note the that the server is listening on compute node r37u29n1006 in this case, we shall use this information later. @@ -53,10 +59,11 @@ Note the that the server is listening on compute node r37u29n1006 in this case, Because a direct connection is not allowed to compute nodes on Salomon, you must establish a SSH tunnel to connect to the server. Choose a port number on your PC to be forwarded to ParaView server, for example 12345. 
If your PC is running Linux, use this command to establish a SSH tunnel: ```console -$ ssh -TN -L 12345:r37u29n1006:11111 username@salomon.it4i.cz +Salomon: $ ssh -TN -L 12345:r37u29n1006:11111 username@salomon.it4i.cz +Anselm: $ ssh -TN -L 12345:cn77:11111 username@anselm.it4i.cz ``` -replace username with your login and r37u29n1006 with the name of compute node your ParaView server is running on (see previous step). +replace username with your login and r37u29n1006 (cn77) with the name of compute node your ParaView server is running on (see previous step). If you use PuTTY on Windows, load Salomon connection configuration, then go to *Connection* -> *SSH* -> *Tunnels* to set up the port forwarding. diff --git a/docs.it4i/software/singularity.md b/docs.it4i/software/singularity.md index 39618e32c735f1ef1dd02447014015518f51e342..ce285e19bd87696a337148586d7636d6dc87a290 100644 --- a/docs.it4i/software/singularity.md +++ b/docs.it4i/software/singularity.md @@ -1,3 +1,5 @@ +# Singularity Container + [Singularity](http://singularity.lbl.gov/) enables users to have full control of their environment. A non-privileged user can "swap out" the operating system on the host for one they control. So if the host system is running RHEL6 but your application runs in Ubuntu/RHEL7, you can create an Ubuntu/RHEL7 image, install your applications into that image, copy the image to another host, and run your application on that host in it’s native Ubuntu/RHEL7 environment. Singularity also allows you to leverage the resources of whatever host you are on. This includes HPC interconnects, resource managers, file systems, GPUs and/or accelerators, etc. Singularity does this by enabling several key facets: @@ -12,7 +14,7 @@ Singularity also allows you to leverage the resources of whatever host you are o Singularity can import, bootstrap, and even run Docker images directly from [Docker Hub](https://hub.docker.com/). You can easily run RHEL7 like this: ```console -[hrb33@r33u01n865 ~]$ cat /etc/redhat-release +[hrb33@r33u01n865 ~]$ cat /etc/redhat-release CentOS release 6.7 (Final) [hrb33@r33u01n865 ~]$ ml Singularity [hrb33@r33u01n865 ~]$ singularity shell docker://centos:latest @@ -23,8 +25,8 @@ Downloading layer: sha256:45a2e645736c4c66ef34acce2407ded21f7a9b231199d3b92d6c97 Downloading layer: sha256:a3ed95caeb02ffe68cdd9fd84406680ae93d633cb16422d00e8a7c22955b46d4 Singularity: Invoking an interactive shell within container... -Singularity.centos:latest> cat /etc/redhat-release -CentOS Linux release 7.3.1611 (Core) +Singularity.centos:latest> cat /etc/redhat-release +CentOS Linux release 7.3.1611 (Core) ``` ## Creating Own Image from Docker Image @@ -32,7 +34,7 @@ CentOS Linux release 7.3.1611 (Core) ```console hrb33@hrb33-toshiba:/$ cd /tmp/ hrb33@hrb33-toshiba:/tmp$ sudo singularity create /tmp/c7.img -[sudo] password for hrb33: +[sudo] password for hrb33: Creating a new image with a maximum size of 768MiB... Executing image create helper Formatting image with ext3 file system @@ -49,7 +51,7 @@ No bootstrap definition passed, updating container Executing Prebootstrap module Executing Postbootstrap module Done. -hrb33@hrb33-toshiba:/tmp$ sudo singularity shell --writable c7.img +hrb33@hrb33-toshiba:/tmp$ sudo singularity shell --writable c7.img Singularity: Invoking an interactive shell within container... 
Singularity.c7.img> mkdir /apps /scratch @@ -68,10 +70,10 @@ Accessing /HOME and /SCRATCH Within Container ```console hrb33@hrb33-toshiba:/tmp$ ssh hrb33@login4.salomon - _____ _ - / ____| | | - | (___ __ _| | ___ _ __ ___ ___ _ __ - \___ \ / _` | |/ _ \| '_ ` _ \ / _ \| '_ \ + _____ _ + / ____| | | + | (___ __ _| | ___ _ __ ___ ___ _ __ + \___ \ / _` | |/ _ \| '_ ` _ \ / _ \| '_ \ ____) | (_| | | (_) | | | | | | (_) | | | | |_____/ \__,_|_|\___/|_| |_| |_|\___/|_| |_| @@ -80,7 +82,7 @@ hrb33@hrb33-toshiba:/tmp$ ssh hrb33@login4.salomon Last login: Fri Feb 10 14:38:36 2017 from 10.0.131.12 [hrb33@login4.salomon ~]$ ml Singularity -[hrb33@login4.salomon ~]$ singularity shell --bind /scratch --bind /apps --writable c7.img +[hrb33@login4.salomon ~]$ singularity shell --bind /scratch --bind /apps --writable c7.img Singularity: Invoking an interactive shell within container... Singularity.c7.img> ls /apps/ -l @@ -124,5 +126,5 @@ drwx------ 3 root root 4096 Aug 15 2016 backup drwxr-x--- 2 root root 4096 Dec 5 10:34 sys drwxrwxrwt 154 root root 20480 Feb 14 14:03 temp drwxr-xr-x 4 root root 4096 Jan 25 10:48 work -Singularity.c7.img> +Singularity.c7.img> ``` diff --git a/docs.it4i/software/spack.md b/docs.it4i/software/spack.md index db383a89835aef339807efe82c4135553662e5e4..284283f7cc3da3dc881e010e835b0dfe5b2283a0 100644 --- a/docs.it4i/software/spack.md +++ b/docs.it4i/software/spack.md @@ -20,7 +20,6 @@ $ ml av Spack !!! note Spack/default is rule for setting up local installation - ## First Usage Module Spack/default The Spack will be installed into `~/Spack` folder. You can set the configuration by modifying ~/.spack/configure.yml. @@ -131,21 +130,21 @@ o git | | | | | | |\ | | | | | | | |\ | | | | | | | o | curl -| |_|_|_|_|_|/| | +| |_|_|_|_|_|/| | |/| | | |_|_|/ / -| | | |/| | | | +| | | |/| | | | | | | o | | | | openssl | |_|/ / / / / -|/| | | | | | +|/| | | | | | | | | | o | | gettext | | | | |\ \ \ | | | | | |\ \ \ | | | | | | |\ \ \ | | | | | | | |\ \ \ | | | | | | | o | | | libxml2 -| |_|_|_|_|_|/| | | | -|/| | | | |_|/| | | | -| | | | |/| | | | | | +| |_|_|_|_|_|/| | | | +|/| | | | |_|/| | | | +| | | | |/| | | | | | o | | | | | | | | | | zlib / / / / / / / / / / | | | o | | | | | | xz @@ -163,11 +162,11 @@ o | | | | | pcre | | | o autoconf | | | o m4 | | | o libsigsegv -| | | +| | | o | | libiconv / / | o expat -| +| o bzip2 ``` @@ -180,7 +179,7 @@ $ spack info git Package: git Homepage: http://git-scm.com -Safe versions: +Safe versions: 2.11.0 https://github.com/git/git/tarball/v2.11.0 2.9.3 https://github.com/git/git/tarball/v2.9.3 2.9.2 https://github.com/git/git/tarball/v2.9.2 @@ -209,7 +208,7 @@ Link Dependencies: Run Dependencies: None -Virtual Packages: +Virtual Packages: None Description: @@ -242,7 +241,7 @@ $ spack edit git !!! note To change source link (ftp:// to http://) use `spack create URL -f` to regenerates rules. 
-**Example** +#### **Example** ```console $ spack install git @@ -257,7 +256,7 @@ curl: (7) couldn't connect to host $ spack create http://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.39.tar.bz2 -f ==> This looks like a URL for pcre ==> Found 2 versions of pcre: - + 8.41 http://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.41.tar.bz2 8.40 http://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.40.tar.bz2 @@ -291,7 +290,7 @@ autoconf@2.69 cmake@3.7.1 expat@2.2.0 git@2.11.0 libsigsegv@2.10 m4 bzip2@1.0.6 curl@7.50.3 gettext@0.19.8.1 libiconv@1.14 libxml2@2.9.4 ncurses@6.0 pcre@8.41 pkg-config@0.29.1 xz@5.2.2 ``` -**Spack colorizes output** +Spack colorizes output. ```console $ spack find | less -R @@ -306,19 +305,19 @@ Neither of these is particularly pretty, easy to remember, or easy to type. Luck ```console $ spack load git ==> This command requires spack's shell integration. - + To initialize spack's shell commands, you must run one of the commands below. Choose the right command for your shell. - + For bash and zsh: . ~/.local/easybuild/software/Spack/0.10.0/share/spack/setup-env.sh - + For csh and tcsh: setenv SPACK_ROOT ~/.local/easybuild/software/Spack/0.10.0 source ~/.local/easybuild/software/Spack/0.10.0/share/spack/setup-env.csh ``` -**First usage** +### First usage ```console $ . ~/.local/easybuild/software/Spack/0.10.0/share/spack/setup-env.sh @@ -342,7 +341,7 @@ You may force uninstall a package with the `--force` option. ```console $ spack uninstall git -==> The following packages will be uninstalled : +==> The following packages will be uninstalled : -- linux-centos6-x86_64 / gcc@4.4.7 ----------------------------- xmh3hmb git@2.11.0%gcc @@ -352,4 +351,4 @@ xmh3hmb git@2.11.0%gcc y ==> Successfully uninstalled git@2.11.0%gcc@4.4.7 arch=linux-centos6-x86_64 -xmh3hmb -``` \ No newline at end of file +``` diff --git a/docs.it4i/anselm/software/virtualization.md b/docs.it4i/software/virtualization.md similarity index 100% rename from docs.it4i/anselm/software/virtualization.md rename to docs.it4i/software/virtualization.md diff --git a/mkdocs.yml b/mkdocs.yml index 610c932b84665c1c38a94b2d03e9c4209deea310..881f5e024bd6117fad1710d2be54fb6236b9760e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,145 +59,106 @@ pages: - Remote Visualization: anselm/remote-visualization.md - PRACE User Support: anselm/prace.md - 'Software': - - Lmod Environment: software/lmod.md - - Modules Matrix: modules-matrix.md - - Singularity Container: software/singularity.md - - EasyBuild: software/easybuild.md - - Spack: software/spack.md - - Salomon Software: - - Available Modules: modules-salomon.md - - Available Modules on UV: modules-salomon-uv.md - - 'Machine learning': - - Introduction: salomon/software/machine-learning/introduction.md - - TensorFlow: salomon/software/machine-learning/tensorflow.md - - 'ANSYS': - - Introduction: salomon/software/ansys/ansys.md - - ANSYS CFX: salomon/software/ansys/ansys-cfx.md - - ANSYS Fluent: salomon/software/ansys/ansys-fluent.md - - ANSYS LS-DYNA: salomon/software/ansys/ansys-ls-dyna.md - - ANSYS MAPDL: salomon/software/ansys/ansys-mechanical-apdl.md - - Workbench: salomon/software/ansys/workbench.md - - Setting License Preferences: salomon/software/ansys/licensing.md - - Licensing and Available Versions: salomon/software/ansys/setting-license-preferences.md - - 'Bioinformatics': - - Bioinformatics Applications: software/bioinformatics.md - - 'Chemistry': - - Molpro: salomon/software/chemistry/molpro.md - - NWChem: 
salomon/software/chemistry/nwchem.md - - Phono3py: salomon/software/chemistry/phono3py.md - - ORCA: software/orca.md - - Compilers: salomon/software/compilers.md + - 'Modules': + - Lmod Environment: software/lmod.md + - Modules Matrix: modules-matrix.md + - Available Salomon Modules: modules-salomon.md + - Available Salomon Modules on UV: modules-salomon-uv.md + - Available Anselm Modules: modules-anselm.md + - ISV Licenses: software/isv_licenses.md + - 'Bioinformatics': + - Bioinformatics Applications: software/bioinformatics.md + - 'Omics Master': + - Overview: software/omics-master/overview.md + - Diagnostic Component (TEAM): software/omics-master/diagnostic-component-team.md + - Priorization Component (BiERApp): software/omics-master/priorization-component-bierapp.md + - 'CAE': - 'COMSOL': - - COMSOL: salomon/software/comsol/comsol-multiphysics.md - - Licensing and Available Versions: salomon/software/comsol/licensing-and-available-versions.md - - 'Debuggers': - - Introduction: salomon/software/debuggers/Introduction.md - - Aislinn: salomon/software/debuggers/aislinn.md - - Allinea Forge (DDT,MAP): salomon/software/debuggers/allinea-ddt.md - - Allinea Performance Reports: salomon/software/debuggers/allinea-performance-reports.md - - Intel VTune Amplifier XE: salomon/software/debuggers/intel-vtune-amplifier.md - - Total View: salomon/software/debuggers/total-view.md - - Valgrind: salomon/software/debuggers/valgrind.md - - Vampir: salomon/software/debuggers/vampir.md + - COMSOL: software/comsol/comsol-multiphysics.md + - Licensing and Available Versions: software/comsol/licensing-and-available-versions.md + - 'Chemistry': + - Molpro: software/chemistry/molpro.md + - Orca: software/chemistry/orca.md + - NWChem: software/chemistry/nwchem.md + - Phono3py: software/chemistry/phono3py.md + - Compilers: software/compilers.md + - 'Debuggers': + - Introduction: software/debuggers/Introduction.md + - Aislinn: software/debuggers/aislinn.md + - Allinea Forge (DDT,MAP): software/debuggers/allinea-ddt.md + - Allinea Performance Reports: software/debuggers/allinea-performance-reports.md + - CUBE: software/debuggers/cube.md + - Intel Performance Counter Monitor: software/debuggers/intel-performance-counter-monitor.md + - Intel VTune Amplifier XE: software/debuggers/intel-vtune-amplifier.md + - PAPI: software/debuggers/papi.md + - Scalasca: software/debuggers/scalasca.md + - Score-P: software/debuggers/score-p.md + - Total View: software/debuggers/total-view.md + - Valgrind: software/debuggers/valgrind.md + - Vampir: software/debuggers/vampir.md + - 'GPU': + - NVIDIA CUDA: anselm/software/nvidia-cuda.md + - 'Intel': - 'Intel Suite': - - Introduction: salomon/software/intel-suite/intel-parallel-studio-introduction.md - - Intel Advisor: salomon/software/intel-suite/intel-advisor.md - - Intel Compilers: salomon/software/intel-suite/intel-compilers.md - - Intel Debugger: salomon/software/intel-suite/intel-debugger.md - - Intel IPP: salomon/software/intel-suite/intel-integrated-performance-primitives.md - - Intel Inspector: salomon/software/intel-suite/intel-inspector.md - - Intel MKL: salomon/software/intel-suite/intel-mkl.md - - Intel TBB: salomon/software/intel-suite/intel-tbb.md - - Intel Trace Analyzer and Collector: salomon/software/intel-suite/intel-trace-analyzer-and-collector.md - - Intel Xeon Phi: salomon/software/intel-xeon-phi.md - - Java: salomon/software/java.md - - 'MPI': - - Introduction: salomon/software/mpi/mpi.md - - MPI4Py (MPI for Python): salomon/software/mpi/mpi4py-mpi-for-python.md - 
- Running Open MPI: salomon/software/mpi/Running_OpenMPI.md - - 'Numerical Languages': - - Introduction: salomon/software/numerical-languages/introduction.md - - Clp: salomon/software/numerical-libraries/Clp.md - - Matlab: salomon/software/numerical-languages/matlab.md - - Octave: salomon/software/numerical-languages/octave.md - - R: salomon/software/numerical-languages/r.md - - OpenCoarrays: salomon/software/numerical-languages/opencoarrays.md - - Operating System: salomon/software/operating-system.md - - ParaView: salomon/software/paraview.md - - 'Phys': - - LMGC90: salomon/software/phys/LMGC90.md - - Anselm Software: - - Available Modules: modules-anselm.md - - 'Machine learning': - - Introduction: anselm/software/machine-learning/introduction.md - - TensorFlow: anselm/software/machine-learning/tensorflow.md + - Introduction: software/intel-suite/intel-parallel-studio-introduction.md + - Intel Advisor: software/intel-suite/intel-advisor.md + - Intel Compilers: software/intel-suite/intel-compilers.md + - Intel Debugger: software/intel-suite/intel-debugger.md + - Intel IPP: software/intel-suite/intel-integrated-performance-primitives.md + - Intel Inspector: software/intel-suite/intel-inspector.md + - Intel MKL: software/intel-suite/intel-mkl.md + - Intel TBB: software/intel-suite/intel-tbb.md + - Intel Trace Analyzer and Collector: software/intel-suite/intel-trace-analyzer-and-collector.md + - 'Intel Xeon Phi': + - Intel Xeon Phi Salomon: software/intel-xeon-phi.md + - Intel Xeon Phi Anselm: software/intel-xeon-phi.anselm.md + - 'Machine Learning': + - Introduction: software/machine-learning/introduction.md + - TensorFlow: software/machine-learning/tensorflow.md + - 'MPI': + - Introduction: software/mpi/mpi.md + - MPI4Py (MPI for Python): software/mpi/mpi4py-mpi-for-python.md + - Running Open MPI: software/mpi/Running_OpenMPI.md + - Running MPICH2: software/mpi/running-mpich2.md + - 'Numerical Languages': + - Introduction: software/numerical-languages/introduction.md + - Clp: salomon/software/numerical-libraries/Clp.md + - R: software/numerical-languages/r.md + - Matlab: software/numerical-languages/matlab.md + - Matlab 2013-2014: software/numerical-languages/matlab_1314.md + - Octave: software/numerical-languages/octave.md + - OpenCoarrays: software/numerical-languages/opencoarrays.md + - 'Numerical Libraries': + - FFTW: software/numerical-libraries/fftw.md + - GSL: software/numerical-libraries/gsl.md + - HDF5: software/numerical-libraries/hdf5.md + - Intel Numerical Libraries: software/numerical-libraries/intel-numerical-libraries.md + - MAGMA for Intel Xeon Phi: software/numerical-libraries/magma-for-intel-xeon-phi.md + - PETSc: software/numerical-libraries/petsc.md + - Trilinos: software/numerical-libraries/trilinos.md + - 'Programming Languages': + - Java: software/java.md + - 'Phys': + - LMGC90: salomon/software/phys/LMGC90.md + - 'Tools': - 'ANSYS': - - Introduction: anselm/software/ansys/ansys.md - - ANSYS CFX: anselm/software/ansys/ansys-cfx.md - - ANSYS Fluent: anselm/software/ansys/ansys-fluent.md - - ANSYS LS-DYNA: anselm/software/ansys/ansys-ls-dyna.md - - ANSYS MAPDL: anselm/software/ansys/ansys-mechanical-apdl.md - - LS-DYNA: anselm/software/ansys/ls-dyna.md - - 'Bioinformatics': - - Bioinformatics Applications: software/bioinformatics.md - - 'Debuggers': - - Allinea Forge (DDT,MAP): anselm/software/debuggers/allinea-ddt.md - - Allinea Performance Reports: anselm/software/debuggers/allinea-performance-reports.md - - CUBE: anselm/software/debuggers/cube.md - - Intel 
Performance Counter Monitor: anselm/software/debuggers/intel-performance-counter-monitor.md - - Intel VTune Amplifier: anselm/software/debuggers/intel-vtune-amplifier.md - - PAPI: anselm/software/debuggers/papi.md - - Scalasca: anselm/software/debuggers/scalasca.md - - Score-P: anselm/software/debuggers/score-p.md - - Total View: anselm/software/debuggers/total-view.md - - VNC: anselm/software/debuggers/debuggers.md - - Valgrind: anselm/software/debuggers/valgrind.md - - Vampir: anselm/software/debuggers/vampir.md - - 'Chemistry': - - Molpro: anselm/software/chemistry/molpro.md - - NWChem: anselm/software/chemistry/nwchem.md - - ORCA: software/orca.md - - COMSOL: anselm/software/comsol-multiphysics.md - - Compilers: anselm/software/compilers.md - - GPI-2: anselm/software/gpi2.md - - 'Intel Suite': - - Introduction: anselm/software/intel-suite/introduction.md - - Intel Compilers: anselm/software/intel-suite/intel-compilers.md - - Intel Debugger: anselm/software/intel-suite/intel-debugger.md - - Intel IPP: anselm/software/intel-suite/intel-integrated-performance-primitives.md - - Intel MKL: anselm/software/intel-suite/intel-mkl.md - - Intel TBB: anselm/software/intel-suite/intel-tbb.md - - Intel Xeon Phi: anselm/software/intel-xeon-phi.md - - ISV Licenses: anselm/software/isv_licenses.md - - Java: anselm/software/java.md - - 'MPI': - - Introduction: anselm/software/mpi/mpi.md - - MPI4Py (MPI for Python): anselm/software/mpi/mpi4py-mpi-for-python.md - - Running Open MPI: anselm/software/mpi/Running_OpenMPI.md - - Running MPICH2: anselm/software/mpi/running-mpich2.md - - 'Numerical Languages': - - Introduction: anselm/software/numerical-languages/introduction.md - - Matlab 2013-2014: anselm/software/numerical-languages/matlab_1314.md - - Matlab: anselm/software/numerical-languages/matlab.md - - Octave: anselm/software/numerical-languages/octave.md - - R: anselm/software/numerical-languages/r.md - - 'Numerical Libraries': - - FFTW: anselm/software/numerical-libraries/fftw.md - - GSL: anselm/software/numerical-libraries/gsl.md - - HDF5: anselm/software/numerical-libraries/hdf5.md - - Intel Numerical Libraries: anselm/software/numerical-libraries/intel-numerical-libraries.md - - MAGMA for Intel Xeon Phi: anselm/software/numerical-libraries/magma-for-intel-xeon-phi.md - - PETSc: anselm/software/numerical-libraries/petsc.md - - Trilinos: anselm/software/numerical-libraries/trilinos.md - - NVIDIA CUDA: anselm/software/nvidia-cuda.md - - 'Omics Master': - - Diagnostic Component (TEAM): anselm/software/omics-master/diagnostic-component-team.md - - Priorization Component (BiERApp): anselm/software/omics-master/priorization-component-bierapp.md - - Overview: anselm/software/omics-master/overview.md - - OpenFOAM: anselm/software/openfoam.md - - Operating System: anselm/software/operating-system.md - - ParaView: anselm/software/paraview.md - - Virtualization: anselm/software/virtualization.md + - Introduction: software/ansys/ansys.md + - ANSYS CFX: software/ansys/ansys-cfx.md + - ANSYS Fluent: software/ansys/ansys-fluent.md + - ANSYS LS-DYNA: software/ansys/ansys-ls-dyna.md + - ANSYS MAPDL: software/ansys/ansys-mechanical-apdl.md + - LS-DYNA: software/ansys/ls-dyna.md + - Workbench: software/ansys/workbench.md + - Setting License Preferences: software/ansys/licensing.md + - Licensing and Available Versions: software/ansys/setting-license-preferences.md + - EasyBuild: software/easybuild.md + - Singularity Container: software/singularity.md + - Spack: software/spack.md + - Virtualization: 
software/virtualization.md + - 'Visualisation': + - GPI-2: software/gpi2.md + - OpenFOAM: software/openfoam.md + - ParaView: software/paraview.md - PBS Pro Documentation: pbspro.md extra: diff --git a/pathcheck.sh b/pathcheck.sh new file mode 100644 index 0000000000000000000000000000000000000000..932b0fb9c118eaa05616f85b7b095786fb9939bd --- /dev/null +++ b/pathcheck.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +#the script controls links, only inside the whole directory, doesnt control outside pages + +for file in $@; do +check=$(cat "$file" | grep -Po "\[.*?\]\([^ ]*?\)" | grep -v "#" | grep -vE "http|@|www|ftp|none" | sed 's/\[.*\]//g' | sed 's/[()]//g' | sed 's/\/$/.md/g') +if [ ! -z "$check" ]; then + +wrong=0 +for line in $check; do + +pathtocheck=$(dirname "$file")/$line + + +if [ -f $(dirname "$file")/"$line" ]; then + : + #echo "ok $pathtocheck" +else + if [ $wrong -eq "0" ]; then + echo "" + echo "\n+++++ $file +++++\n" + fi + wrong=1 + echo "wrong link in $pathtocheck" + +fi +done +fi +done +echo "" diff --git a/todelete b/todelete new file mode 100644 index 0000000000000000000000000000000000000000..b42e29bda03bfe0d52ee37f5493b08f4938c2208 --- /dev/null +++ b/todelete @@ -0,0 +1,83 @@ +docs.it4i/anselm/software/numerical-languages/introduction.md +docs.it4i/anselm/software/numerical-languages/matlab.md +docs.it4i/anselm/software/numerical-languages/matlab_1314.md +docs.it4i/anselm/software/numerical-languages/octave.md +docs.it4i/anselm/software/numerical-languages/r.md +docs.it4i/salomon/software/comsol/licensing-and-available-versions.md +docs.it4i/salomon/software/java.md +docs.it4i/salomon/software/numerical-languages/introduction.md +docs.it4i/salomon/software/numerical-languages/matlab.md +docs.it4i/salomon/software/numerical-languages/octave.md +docs.it4i/salomon/software/numerical-languages/opencoarrays.md +docs.it4i/salomon/software/numerical-languages/r.md +./docs.it4i/anselm/software/ansys/ansys-cfx.md +./docs.it4i/anselm/software/ansys/ansys-fluent.md +./docs.it4i/anselm/software/ansys/ansys-ls-dyna.md +./docs.it4i/anselm/software/ansys/ansys-mechanical-apdl.md +./docs.it4i/anselm/software/ansys/ansys.md +./docs.it4i/anselm/software/ansys/ls-dyna.md +./docs.it4i/salomon/software/ansys/ansys-cfx.md +./docs.it4i/salomon/software/ansys/ansys-fluent.md +./docs.it4i/salomon/software/ansys/ansys-ls-dyna.md +./docs.it4i/salomon/software/ansys/ansys-mechanical-apdl.md +./docs.it4i/salomon/software/ansys/ansys.md +./docs.it4i/salomon/software/ansys/licensing.md +./docs.it4i/salomon/software/ansys/setting-license-preferences.md +./docs.it4i/salomon/software/ansys/workbench.md +./docs.it4i/anselm/software/machine-learning/introduction.md +./docs.it4i/anselm/software/machine-learning/tensorflow.md +./docs.it4i/salomon/software/machine-learning/introduction.md +./docs.it4i/salomon/software/machine-learning/tensorflow.md +./docs.it4i/anselm/software/debuggers +./docs.it4i/anselm/software/debuggers/allinea-ddt.md +./docs.it4i/anselm/software/debuggers/allinea-performance-reports.md +./docs.it4i/anselm/software/debuggers/cube.md +./docs.it4i/anselm/software/debuggers/debuggers.md +./docs.it4i/anselm/software/debuggers/intel-performance-counter-monitor.md +./docs.it4i/anselm/software/debuggers/intel-vtune-amplifier.md +./docs.it4i/anselm/software/debuggers/papi.md +./docs.it4i/anselm/software/debuggers/scalasca.md +./docs.it4i/anselm/software/debuggers/score-p.md +./docs.it4i/anselm/software/debuggers/total-view.md +./docs.it4i/anselm/software/debuggers/valgrind.md 
+./docs.it4i/anselm/software/debuggers/vampir.md +./docs.it4i/salomon/software/debuggers +./docs.it4i/salomon/software/debuggers/Introduction.md +./docs.it4i/salomon/software/debuggers/aislinn.md +./docs.it4i/salomon/software/debuggers/allinea-ddt.md +./docs.it4i/salomon/software/debuggers/allinea-performance-reports.md +./docs.it4i/salomon/software/debuggers/intel-vtune-amplifier.md +./docs.it4i/salomon/software/debuggers/mympiprog_32p_2014-10-15_16-56.html +./docs.it4i/salomon/software/debuggers/mympiprog_32p_2014-10-15_16-56.txt +./docs.it4i/salomon/software/debuggers/total-view.md +./docs.it4i/salomon/software/debuggers/valgrind.md +./docs.it4i/salomon/software/debuggers/vampir.md +./docs.it4i/anselm/software/numerical-libraries +./docs.it4i/anselm/software/numerical-libraries/fftw.md +./docs.it4i/anselm/software/numerical-libraries/gsl.md +./docs.it4i/anselm/software/numerical-libraries/hdf5.md +./docs.it4i/anselm/software/numerical-libraries/intel-numerical-libraries.md +./docs.it4i/anselm/software/numerical-libraries/magma-for-intel-xeon-phi.md +./docs.it4i/anselm/software/numerical-libraries/petsc.md +./docs.it4i/anselm/software/numerical-libraries/trilinos.md +./docs.it4i/anselm/software/intel-suite +./docs.it4i/anselm/software/intel-suite/intel-compilers.md +./docs.it4i/anselm/software/intel-suite/intel-debugger.md +./docs.it4i/anselm/software/intel-suite/intel-integrated-performance-primitives.md +./docs.it4i/anselm/software/intel-suite/intel-mkl.md +./docs.it4i/anselm/software/intel-suite/intel-tbb.md +./docs.it4i/anselm/software/intel-suite/introduction.md +./docs.it4i/salomon/software/intel-suite +./docs.it4i/salomon/software/intel-suite/intel-advisor.md +./docs.it4i/salomon/software/intel-suite/intel-compilers.md +./docs.it4i/salomon/software/intel-suite/intel-debugger.md +./docs.it4i/salomon/software/intel-suite/intel-inspector.md +./docs.it4i/salomon/software/intel-suite/intel-integrated-performance-primitives.md +./docs.it4i/salomon/software/intel-suite/intel-mkl.md +./docs.it4i/salomon/software/intel-suite/intel-parallel-studio-introduction.md +./docs.it4i/salomon/software/intel-suite/intel-tbb.md +./docs.it4i/salomon/software/intel-suite/intel-trace-analyzer-and-collector.md +./docs.it4i/anselm/software/paraview.md +./docs.it4i/anselm/software/compilers.md +./docs.it4i/salomon/software/compilers.md +./docs.it4i/salomon/software/paraview.md
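The newly added `pathcheck.sh` takes Markdown files as arguments and reports relative links that do not resolve within the repository. A usage sketch, assuming it is run from the repository root (the file selection is illustrative):

```console
$ bash pathcheck.sh $(find docs.it4i -name "*.md")    # check all documentation pages for broken relative links
```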