From e9c271149c4a95b86ab369933d1a85980f5f5680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Krup=C4=8D=C3=ADk?= <lukas.krupcik@vsb.cz> Date: Thu, 19 Sep 2019 10:30:36 +0200 Subject: [PATCH] Dgx upgrade --- .spelling | 1 + docs.it4i/dgx2/accessing.md | 7 +- docs.it4i/dgx2/job_execution.md | 82 ++------------------- docs.it4i/dgx2/software.md | 29 +++++++- docs.it4i/environment-and-modules.md | 1 + docs.it4i/software/tools/ansys/licensing.md | 1 + mkdocs.yml | 1 + scripts/combinations.py | 2 +- scripts/get_cvs.sh | 9 ++- scripts/get_modules.sh | 10 ++- scripts/matrix.py | 19 +++++ scripts/modules-json.py | 41 ++++++----- scripts/modules-matrix.py | 62 +++++++++------- 13 files changed, 124 insertions(+), 141 deletions(-) create mode 100755 scripts/matrix.py diff --git a/.spelling b/.spelling index 99b741e0f..4af97d892 100644 --- a/.spelling +++ b/.spelling @@ -1,3 +1,4 @@ +NVIDIA DGX-2 nvidia smi nvidia-smi diff --git a/docs.it4i/dgx2/accessing.md b/docs.it4i/dgx2/accessing.md index c84bb06e2..e8b9ec995 100644 --- a/docs.it4i/dgx2/accessing.md +++ b/docs.it4i/dgx2/accessing.md @@ -1,10 +1,5 @@ # Accessing the DGX-2 -## Before You Access - -!!! warning - GPUs are single-user devices. Memories of each GPU are not purged between job runs. Furthermore, they can be read (but not written) by any user at any time. Consider the confidentiality of your running jobs. - ## How to Access !!! info @@ -14,4 +9,4 @@ The DGX-2 machine can be accessed through the scheduler from Salomon login nodes The NVIDIA DGX-2 has its own instance of the scheduler, it can be accessed by loading the `DGX-2` module. See [Resource Allocation and Job Execution][1]. 
-[1]: job_execution.md \ No newline at end of file +[1]: job_execution.md diff --git a/docs.it4i/dgx2/job_execution.md b/docs.it4i/dgx2/job_execution.md index 19666b9f4..549cc1082 100644 --- a/docs.it4i/dgx2/job_execution.md +++ b/docs.it4i/dgx2/job_execution.md @@ -2,11 +2,7 @@ To run a job, computational resources of DGX-2 must be allocated. -!!! info - You can access the DGX PBS scheduler by loadnig the "DGX-2" module. - -The DGX-2 is using independent PBS scheduler. Load the DGX-2 module to access -the scheduler +The DGX-2 uses an independent PBS scheduler. Load the DGX-2 module to access the scheduler: ```console $ml DGX-2 @@ -19,7 +15,7 @@ The resources are allocated to the job in a fair-share fashion, subject to const * **qdgx**, the queue for DGX-2 machine !!! note - Job maximum walltime is **4** hours, there might be only **5** jobs in the queue and only **one** running job per user. + Job maximum walltime is **4** hours. ## Job Submission and Execution @@ -30,85 +26,20 @@ The `qsub` submits the job into the queue. The command creates a request to the When allocating computational resources for the job, specify: 1. a queue for your job (the default is **qdgx**) -1. the number of computational nodes required (maximum is **16**, we have only one DGX-2 machine (yet)) 1. the maximum wall time allocated to your calculation (default is **2 hour**, maximum is **4 hour**) 1. a Jobscript or interactive switch -!!! note - Right now, the DGX-2 is divided into 16 computational nodes. Every node contains 6 CPUs (3 physical cores + 3 HT cores), 1 GPU and 96GB of RAM memory. - !!! info You can access the DGX PBS scheduler by loadnig the "DGX-2" module. 
Submit the job using the `qsub` command: -**Example for 1 GPU** +**Example** ```console [kru0052@login4.salomon ~]$ ml DGX-2 PBS 18.1.3 for DGX-2 machine -[kru0052@login4.salomon ~]$ qsub -q qdgx -l select=1 -l walltime=04:00:00 -I -qsub: waiting for job 257.ldgx to start -qsub: job 257.ldgx ready - -kru0052@dgx:~$ nvidia-smi -Thu Mar 14 07:46:01 2019 -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 410.104 Driver Version: 410.104 CUDA Version: 10.0 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -|===============================+======================+======================| -| 0 Tesla V100-SXM3... On | 00000000:57:00.0 Off | 0 | -| N/A 29C P0 50W / 350W | 0MiB / 32480MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -kru0052@dgx:~$ exit -[kru0052@login4.salomon ~]$ ml purge -PBS 13.1.1 for cluster Salomon -[kru0052@login4.salomon ~]$ -``` - -**Example for 4 GPU** - -```console -[kru0052@login4.salomon ~]$ ml DGX-2 -PBS 18.1.3 for DGX-2 machine -[kru0052@login4.salomon ~]$ qsub -q qdgx -l select=4 -l walltime=04:00:00 -I -qsub: waiting for job 256.ldgx to start -qsub: job 256.ldgx ready - -kru0052@dgx:~$ nvidia-smi -Thu Mar 14 07:45:29 2019 -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 410.104 Driver Version: 410.104 CUDA Version: 10.0 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -|===============================+======================+======================| -| 0 Tesla V100-SXM3... 
On | 00000000:57:00.0 Off | 0 | -| N/A 29C P0 50W / 350W | 0MiB / 32480MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -| 1 Tesla V100-SXM3... On | 00000000:59:00.0 Off | 0 | -| N/A 35C P0 51W / 350W | 0MiB / 32480MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -| 2 Tesla V100-SXM3... On | 00000000:5C:00.0 Off | 0 | -| N/A 30C P0 50W / 350W | 0MiB / 32480MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -| 3 Tesla V100-SXM3... On | 00000000:5E:00.0 Off | 0 | -| N/A 35C P0 53W / 350W | 0MiB / 32480MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -kru0052@dgx:~$ exit -[kru0052@login4.salomon ~]$ ml purge -PBS 13.1.1 for cluster Salomon -[kru0052@login4.salomon ~]$ -``` - -**Example for 16 GPU (all DGX-2)** - -```console -[kru0052@login4.salomon ~]$ ml DGX-2 -PBS 18.1.3 for DGX-2 machine -[kru0052@login4.salomon ~]$ qsub -q qdgx -l select=16 -l walltime=04:00:00 -I +[kru0052@login4.salomon ~]$ qsub -q qdgx -l walltime=04:00:00 -I qsub: waiting for job 258.ldgx to start qsub: job 258.ldgx ready @@ -177,9 +108,6 @@ PBS 13.1.1 for cluster Salomon !!! tip Submit the intreractive job using the `qsub -I ...` command. -!!! info - You can determine allocated GPUs from environment variable **CUDA_ALLOCATED_DEVICES**. Variable **CUDA_VISIBLE_DEVICES** has to be count from **0** every time! - ### Job Execution The DGX-2 machine runs only a bare-bone, minimal operating system. 
Users are expected to run @@ -195,7 +123,7 @@ to download the container via singularity, see example below: ```console [kru0052@login4.salomon ~]$ ml DGX-2 PBS 18.1.3 for DGX-2 machine -$ qsub -q qdgx -l select=16 -l walltime=01:00:00 -I +$ qsub -q qdgx -l walltime=01:00:00 -I qsub: waiting for job 96.ldgx to start qsub: job 96.ldgx ready diff --git a/docs.it4i/dgx2/software.md b/docs.it4i/dgx2/software.md index bf91c9e66..db1257271 100644 --- a/docs.it4i/dgx2/software.md +++ b/docs.it4i/dgx2/software.md @@ -19,11 +19,34 @@ NVIDIA expects usage of Docker as a containerization tool, but Docker is not a s Singularity can be used very similar to Docker, the only change is a rewrite of an image URL address. For example, original command for Docker `docker run -it nvcr.io/nvidia/theano:18.08` should be rewritten to `singularity shell docker://nvcr.io/nvidia/theano:18.08`. More about Singularity [here][1]. -!!! info - The `--nv` Singularity switch is used by default on DGX-2. - For fast container deployment, all images are cached after first use in *lscratch* directory. This behavior can be changed by *SINGULARITY_CACHEDIR* environment variable, but the start time of container will increase significantly. 
+```console +$ ml av Singularity + +---------------------------- /apps/modules/tools ---------------------------- + Singularity/3.3.0 +``` + +## MPI Modules + +```console +$ ml av MPI + +---------------------------- /apps/modules/mpi ---------------------------- + OpenMPI/2.1.5-GCC-6.3.0-2.27 OpenMPI/3.1.4-GCC-6.3.0-2.27 OpenMPI/4.0.0-GCC-6.3.0-2.27 (D) impi/2017.4.239-iccifort-2017.7.259-GCC-6.3.0-2.27 +``` + +## Compiler Modules + +```console +$ ml av gcc + +---------------------------- /apps/modules/compiler ---------------------------- + GCC/6.3.0-2.27 GCCcore/6.3.0 icc/2017.7.259-GCC-6.3.0-2.27 ifort/2017.7.259-GCC-6.3.0-2.27 + +``` + [1]: ../software/tools/singularity.md [a]: https://ngc.nvidia.com/catalog/landing [b]: https://www.sylabs.io/ diff --git a/docs.it4i/environment-and-modules.md b/docs.it4i/environment-and-modules.md index fafcadce3..d3b631967 100644 --- a/docs.it4i/environment-and-modules.md +++ b/docs.it4i/environment-and-modules.md @@ -8,6 +8,7 @@ The table shows which shells are supported on IT4Innovations clusters. | --------------- | ---- | ---- | --- | --- | | Anselm Cluster | yes | yes | yes | yes | | Salomon Cluster | yes | yes | yes | yes | +| DGX-2 Cluster | yes | no | no | no | !!! info BASH is the default shell. Should you need a different shell write an email to support@it4i.cz. diff --git a/docs.it4i/software/tools/ansys/licensing.md b/docs.it4i/software/tools/ansys/licensing.md index bccd7cf64..17c91235c 100644 --- a/docs.it4i/software/tools/ansys/licensing.md +++ b/docs.it4i/software/tools/ansys/licensing.md @@ -15,6 +15,7 @@ The licence intended to be used for science and research, publications, students The licence intended to be used for science and research, publications, students’ projects, commercial research with no commercial use restrictions. 
## Server / Port + lic-ansys.vsb.cz / 1055 (2325) ## Available Versions diff --git a/mkdocs.yml b/mkdocs.yml index 0e9d92d02..04c8362f5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -94,6 +94,7 @@ nav: - Available Salomon Modules: modules-salomon.md - Available Salomon Modules on UV: modules-salomon-uv.md - Available Salomon Modules on PHI Cards: modules-salomon-phi.md + - Available DGX-2 Modules: modules-dgx.md - ISV Licenses: software/isv_licenses.md - Bioinformatics: - Bioinformatics Applications: software/bio/bioinformatics.md diff --git a/scripts/combinations.py b/scripts/combinations.py index 8186da1ab..067f39f0c 100644 --- a/scripts/combinations.py +++ b/scripts/combinations.py @@ -1,6 +1,6 @@ import itertools import re -l = ['A', 'S', 'U', 'T'] +l = ['A', 'S', 'U', 'T', 'D'] mask = ''.join(reversed(l)) for i in range(1,len(l)+1): for comb in itertools.combinations(l, i): diff --git a/scripts/get_cvs.sh b/scripts/get_cvs.sh index 703af83ef..eaf721d0a 100755 --- a/scripts/get_cvs.sh +++ b/scripts/get_cvs.sh @@ -1,5 +1,6 @@ #!/bin/bash -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o modules-anselm.csv -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o modules-salomon.csv -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o modules-salomon-uv.csv -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o modules-salomon-phi.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o anselm.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o salomon.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/dgx.csv -o dgx.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o uv2000.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o phi.csv diff --git a/scripts/get_modules.sh b/scripts/get_modules.sh index bade433da..882b5ff78 100755 --- a/scripts/get_modules.sh +++ b/scripts/get_modules.sh @@ 
-1,9 +1,11 @@ #!/bin/bash curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.md -o docs.it4i/modules-anselm.md curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.md -o docs.it4i/modules-salomon.md +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/dgx.md -o docs.it4i/modules-dgx.md curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.md -o docs.it4i/modules-salomon-uv.md curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.md -o docs.it4i/modules-salomon-phi.md -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o scripts/modules-anselm.csv -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o scripts/modules-salomon.csv -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o scripts/modules-salomon-uv.csv -curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o scripts/modules-salomon-phi.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o scripts/anselm.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o scripts/salomon.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/dgx.csv -o scripts/dgx.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o scripts/uv2000.csv +curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o scripts/phi.csv diff --git a/scripts/matrix.py b/scripts/matrix.py new file mode 100755 index 000000000..1765624f5 --- /dev/null +++ b/scripts/matrix.py @@ -0,0 +1,19 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +import csv +import collections +import itertools +import re +from distutils.version import LooseVersion +l = ['A', 'S', 'U', 'P', 'D'] +mask = ''.join(reversed(l)) +from itertools import product +for bits in product([0, 1], repeat=len(l)): + s = "".join(str(bit) for bit in bits) + ns = "" + for i in range(len(s)): + if s[i] == "1": + ns += mask[i] + else: + ns += "-" + print ns diff --git a/scripts/modules-json.py 
b/scripts/modules-json.py index 57474110d..fa7934182 100755 --- a/scripts/modules-json.py +++ b/scripts/modules-json.py @@ -1,8 +1,12 @@ #!/usr/bin/python # -*- coding: utf-8 -*- + import csv import collections import json +import itertools +import re + from distutils.version import LooseVersion def get_data(filename): @@ -15,30 +19,29 @@ def get_data(filename): return list(reader) # only return the reader when you have finished. your_list = [] -your_list += get_data('./scripts/modules-anselm.csv') -your_list += get_data('./scripts/modules-salomon.csv') -your_list += get_data('./scripts/modules-salomon-uv.csv') -#print your_list +your_list += get_data('./scripts/anselm.csv') +your_list += get_data('./scripts/salomon.csv') +your_list += get_data('./scripts/uv2000.csv') +your_list += get_data('./scripts/phi.csv') +your_list += get_data('./scripts/dgx.csv') -#a=[["python/2.8.1",1],["python/2.9.1",2],["python/2.8.1",4],["python/3.0.1",4]] counts = dict() for i in your_list: - #print i[0] - #print int(i[1]) counts[i[0]]=counts.get(i[0], 0) + int(i[1]) -#print sorted(counts.items()) - -c=[ -"---", -"--A", -"-S-", -"-SA", -"U--", -"U-A", -"US-", -"USA", -] +l = ['A', 'S', 'U', 'P', 'D'] +c = [] +mask = ''.join(reversed(l)) +from itertools import product +for bits in product([0, 1], repeat=len(l)): + s = "".join(str(bit) for bit in bits) + ns = "" + for i in range(len(s)): + if s[i] == "1": + ns += mask[i] + else: + ns += "-" + c.append(ns) software = dict() versions = '' diff --git a/scripts/modules-matrix.py b/scripts/modules-matrix.py index 70cb5118d..6329510f4 100755 --- a/scripts/modules-matrix.py +++ b/scripts/modules-matrix.py @@ -1,7 +1,11 @@ #!/usr/bin/python # -*- coding: utf-8 -*- + import csv import collections +import itertools +import re + from distutils.version import LooseVersion def get_data(filename): @@ -14,41 +18,47 @@ def get_data(filename): return list(reader) # only return the reader when you have finished. 
your_list = [] -your_list += get_data('./scripts/modules-anselm.csv') -your_list += get_data('./scripts/modules-salomon.csv') -your_list += get_data('./scripts/modules-salomon-uv.csv') -#print your_list +your_list += get_data('./scripts/anselm.csv') +your_list += get_data('./scripts/salomon.csv') +your_list += get_data('./scripts/uv2000.csv') +your_list += get_data('./scripts/phi.csv') +your_list += get_data('./scripts/dgx.csv') + +print your_list -#a=[["python/2.8.1",1],["python/2.9.1",2],["python/2.8.1",4],["python/3.0.1",4]] counts = dict() for i in your_list: #print i[0] #print int(i[1]) counts[i[0]]=counts.get(i[0], 0) + int(i[1]) -#print sorted(counts.items()) - -c=[ -"---", -"--A", -"-S-", -"-SA", -"U--", -"U-A", -"US-", -"USA", -] +l = ['A', 'S', 'U', 'P', 'D', 'B'] +c = [] +mask = ''.join(reversed(l)) +from itertools import product +for bits in product([0, 1], repeat=len(l)): + s = "".join(str(bit) for bit in bits) + ns = "" + for i in range(len(s)): + if s[i] == "1": + ns += mask[i] + else: + ns += "-" + c.append(ns) print '!!! 
Hint "Cluster Acronyms"' print ' ```' -print ' USA' -print ' |||' -print ' ||+---->Anselm' -print ' |+----->Salomon' -print ' +------>UV2000' +print ' N D P U S A' +print ' | | | | | |' +print ' | | | | | +----> Anselm' +print ' | | | | +------> Salomon' +print ' | | | +--------> UV2000' +print ' | | +----------> Phi' +print ' | +------------> DGX-2' +print ' +--------------> New Cluster' print ' ```' print -print '| Module </br><form><input id="searchInput" placeholder="🔍 Filter" style="width: 8rem; border-radius: 0.2rem; color: black; padding-left: .2rem;"><form> | Versions | Clusters |' +print '| Module </br><input id="searchInput" placeholder="🔍 Filter" style="width: 8rem; border-radius: 0.2rem; color: black; padding-left: .2rem;"> | Versions | Clusters |' print "| ------ | -------- | -------- |" software = dict() @@ -57,9 +67,9 @@ clusters = '' prev = '' for m,i in sorted(counts.items()): - #print m +# print m split = m.split('/') - #print split +# print split if len(split) > 1: a = split[0] b = split[1] @@ -68,8 +78,6 @@ for m,i in sorted(counts.items()): software[a][b] = '`' + c[i] + '`' prev = a -#print software.items() - for m in sorted(software.items(), key=lambda i: i[0].lower()): software = m[0] versions = [] -- GitLab