From e9c271149c4a95b86ab369933d1a85980f5f5680 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Krup=C4=8D=C3=ADk?= <lukas.krupcik@vsb.cz>
Date: Thu, 19 Sep 2019 10:30:36 +0200
Subject: [PATCH] Dgx upgrade

---
 .spelling                                   |  1 +
 docs.it4i/dgx2/accessing.md                 |  7 +-
 docs.it4i/dgx2/job_execution.md             | 82 ++-------------------
 docs.it4i/dgx2/software.md                  | 29 +++++++-
 docs.it4i/environment-and-modules.md        |  1 +
 docs.it4i/software/tools/ansys/licensing.md |  1 +
 mkdocs.yml                                  |  1 +
 scripts/combinations.py                     |  2 +-
 scripts/get_cvs.sh                          |  9 ++-
 scripts/get_modules.sh                      | 10 ++-
 scripts/matrix.py                           | 19 +++++
 scripts/modules-json.py                     | 41 ++++++-----
 scripts/modules-matrix.py                   | 62 +++++++++-------
 13 files changed, 124 insertions(+), 141 deletions(-)
 create mode 100755 scripts/matrix.py

diff --git a/.spelling b/.spelling
index 99b741e0f..4af97d892 100644
--- a/.spelling
+++ b/.spelling
@@ -1,3 +1,4 @@
+NVIDIA DGX-2
 nvidia
 smi
 nvidia-smi
diff --git a/docs.it4i/dgx2/accessing.md b/docs.it4i/dgx2/accessing.md
index c84bb06e2..e8b9ec995 100644
--- a/docs.it4i/dgx2/accessing.md
+++ b/docs.it4i/dgx2/accessing.md
@@ -1,10 +1,5 @@
 # Accessing the DGX-2
 
-## Before You Access
-
-!!! warning
-    GPUs are single-user devices. Memories of each GPU are not purged between job runs. Furthermore, they can be read (but not written) by any user at any time. Consider the confidentiality of your running jobs.
-
 ## How to Access
 
 !!! info
@@ -14,4 +9,4 @@ The DGX-2 machine can be accessed through the scheduler from Salomon login nodes
 
 The NVIDIA DGX-2 has its own instance of the scheduler, it can be accessed by loading the `DGX-2` module. See [Resource Allocation and Job Execution][1].
 
-[1]: job_execution.md
\ No newline at end of file
+[1]: job_execution.md
diff --git a/docs.it4i/dgx2/job_execution.md b/docs.it4i/dgx2/job_execution.md
index 19666b9f4..549cc1082 100644
--- a/docs.it4i/dgx2/job_execution.md
+++ b/docs.it4i/dgx2/job_execution.md
@@ -2,11 +2,7 @@
 
 To run a job, computational resources of DGX-2 must be allocated.
 
-!!! info
-    You can access the DGX PBS scheduler by loadnig the "DGX-2" module.
-
-The DGX-2 is using independent PBS scheduler. Load the DGX-2 module to access
-the scheduler
+The DGX-2 uses an independent PBS scheduler. Load the DGX-2 module to access the scheduler:
 
 ```console
 $ml DGX-2
@@ -19,7 +15,7 @@ The resources are allocated to the job in a fair-share fashion, subject to const
 * **qdgx**, the queue for DGX-2 machine
 
 !!! note
-    Job maximum walltime is **4** hours, there might be only **5** jobs in the queue and only **one** running job per user.
+    Job maximum walltime is **4** hours.
 
 ## Job Submission and Execution
 
@@ -30,85 +26,20 @@ The `qsub` submits the job into the queue. The command creates a request to the
 When allocating computational resources for the job, specify:
 
 1. a queue for your job (the default is **qdgx**)
-1. the number of computational nodes required (maximum is **16**, we have only one DGX-2 machine (yet))
 1. the maximum wall time allocated to your calculation (default is **2 hour**, maximum is **4 hour**)
 1. a Jobscript or interactive switch
 
-!!! note
-    Right now, the DGX-2 is divided into 16 computational nodes. Every node contains 6 CPUs (3 physical cores + 3 HT cores), 1 GPU and 96GB of RAM memory.
-
 !!! info
     You can access the DGX PBS scheduler by loadnig the "DGX-2" module.
 
 Submit the job using the `qsub` command:
 
-**Example for 1 GPU**
+**Example**
 
 ```console
 [kru0052@login4.salomon ~]$ ml DGX-2
 PBS 18.1.3 for DGX-2 machine
-[kru0052@login4.salomon ~]$ qsub -q qdgx -l select=1 -l walltime=04:00:00 -I
-qsub: waiting for job 257.ldgx to start
-qsub: job 257.ldgx ready
-
-kru0052@dgx:~$ nvidia-smi
-Thu Mar 14 07:46:01 2019
-+-----------------------------------------------------------------------------+
-| NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
-|-------------------------------+----------------------+----------------------+
-| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
-|===============================+======================+======================|
-|   0  Tesla V100-SXM3...  On   | 00000000:57:00.0 Off |                    0 |
-| N/A   29C    P0    50W / 350W |      0MiB / 32480MiB |      0%      Default |
-+-------------------------------+----------------------+----------------------+
-kru0052@dgx:~$ exit
-[kru0052@login4.salomon ~]$ ml purge
-PBS 13.1.1 for cluster Salomon
-[kru0052@login4.salomon ~]$
-```
-
-**Example for 4 GPU**
-
-```console
-[kru0052@login4.salomon ~]$ ml DGX-2
-PBS 18.1.3 for DGX-2 machine
-[kru0052@login4.salomon ~]$ qsub -q qdgx -l select=4 -l walltime=04:00:00 -I
-qsub: waiting for job 256.ldgx to start
-qsub: job 256.ldgx ready
-
-kru0052@dgx:~$ nvidia-smi
-Thu Mar 14 07:45:29 2019
-+-----------------------------------------------------------------------------+
-| NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
-|-------------------------------+----------------------+----------------------+
-| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
-|===============================+======================+======================|
-|   0  Tesla V100-SXM3...  On   | 00000000:57:00.0 Off |                    0 |
-| N/A   29C    P0    50W / 350W |      0MiB / 32480MiB |      0%      Default |
-+-------------------------------+----------------------+----------------------+
-|   1  Tesla V100-SXM3...  On   | 00000000:59:00.0 Off |                    0 |
-| N/A   35C    P0    51W / 350W |      0MiB / 32480MiB |      0%      Default |
-+-------------------------------+----------------------+----------------------+
-|   2  Tesla V100-SXM3...  On   | 00000000:5C:00.0 Off |                    0 |
-| N/A   30C    P0    50W / 350W |      0MiB / 32480MiB |      0%      Default |
-+-------------------------------+----------------------+----------------------+
-|   3  Tesla V100-SXM3...  On   | 00000000:5E:00.0 Off |                    0 |
-| N/A   35C    P0    53W / 350W |      0MiB / 32480MiB |      0%      Default |
-+-------------------------------+----------------------+----------------------+
-kru0052@dgx:~$ exit
-[kru0052@login4.salomon ~]$ ml purge
-PBS 13.1.1 for cluster Salomon
-[kru0052@login4.salomon ~]$
-```
-
-**Example for 16 GPU (all DGX-2)**
-
-```console
-[kru0052@login4.salomon ~]$ ml DGX-2
-PBS 18.1.3 for DGX-2 machine
-[kru0052@login4.salomon ~]$ qsub -q qdgx -l select=16 -l walltime=04:00:00 -I
+[kru0052@login4.salomon ~]$ qsub -q qdgx -l walltime=04:00:00 -I
 qsub: waiting for job 258.ldgx to start
 qsub: job 258.ldgx ready
 
@@ -177,9 +108,6 @@ PBS 13.1.1 for cluster Salomon
 !!! tip
     Submit the intreractive job using the `qsub -I ...` command.
 
-!!! info
-    You can determine allocated GPUs from environment variable **CUDA_ALLOCATED_DEVICES**. Variable **CUDA_VISIBLE_DEVICES** has to be count from **0** every time!
-
 ### Job Execution
 
 The DGX-2 machine runs only a bare-bone, minimal operating system. Users are expected to run
@@ -195,7 +123,7 @@ to download the container via singularity, see example below:
 ```console
 [kru0052@login4.salomon ~]$ ml DGX-2
 PBS 18.1.3 for DGX-2 machine
-$ qsub -q qdgx -l select=16 -l walltime=01:00:00 -I
+$ qsub -q qdgx -l walltime=01:00:00 -I
 qsub: waiting for job 96.ldgx to start
 qsub: job 96.ldgx ready
 
diff --git a/docs.it4i/dgx2/software.md b/docs.it4i/dgx2/software.md
index bf91c9e66..db1257271 100644
--- a/docs.it4i/dgx2/software.md
+++ b/docs.it4i/dgx2/software.md
@@ -19,11 +19,34 @@ NVIDIA expects usage of Docker as a containerization tool, but Docker is not a s
 
 Singularity can be used very similar to Docker, the only change is a rewrite of an image URL address. For example, original command for Docker `docker run -it nvcr.io/nvidia/theano:18.08` should be rewritten to `singularity shell docker://nvcr.io/nvidia/theano:18.08`. More about Singularity [here][1].
 
-!!! info
-    The `--nv` Singularity switch is used by default on DGX-2.
-
 For fast container deployment, all images are cached after first use in *lscratch* directory. This behavior can be changed by *SINGULARITY_CACHEDIR* environment variable, but the start time of container will increase significantly.
 
+```console
+$ ml av Singularity
+
+---------------------------- /apps/modules/tools ----------------------------
+   Singularity/3.3.0
+```
+
+## MPI Modules
+
+```console
+$ ml av MPI
+
+---------------------------- /apps/modules/mpi ----------------------------
+   OpenMPI/2.1.5-GCC-6.3.0-2.27    OpenMPI/3.1.4-GCC-6.3.0-2.27    OpenMPI/4.0.0-GCC-6.3.0-2.27 (D)    impi/2017.4.239-iccifort-2017.7.259-GCC-6.3.0-2.27
+```
+
+## Compiler Modules
+
+```console
+$ ml av gcc
+
+---------------------------- /apps/modules/compiler ----------------------------
+   GCC/6.3.0-2.27    GCCcore/6.3.0    icc/2017.7.259-GCC-6.3.0-2.27    ifort/2017.7.259-GCC-6.3.0-2.27
+
+```
+
 [1]: ../software/tools/singularity.md
 [a]: https://ngc.nvidia.com/catalog/landing
 [b]: https://www.sylabs.io/
diff --git a/docs.it4i/environment-and-modules.md b/docs.it4i/environment-and-modules.md
index fafcadce3..d3b631967 100644
--- a/docs.it4i/environment-and-modules.md
+++ b/docs.it4i/environment-and-modules.md
@@ -8,6 +8,7 @@ The table shows which shells are supported on IT4Innovations clusters.
 | --------------- | ---- | ---- | --- | --- |
 | Anselm Cluster  | yes  | yes  | yes | yes |
 | Salomon Cluster | yes  | yes  | yes | yes |
+| DGX-2 Cluster   | yes  | no   | no  | no  |
 
 !!! info
     BASH is the default shell. Should you need a different shell write an email to support@it4i.cz.
diff --git a/docs.it4i/software/tools/ansys/licensing.md b/docs.it4i/software/tools/ansys/licensing.md
index bccd7cf64..17c91235c 100644
--- a/docs.it4i/software/tools/ansys/licensing.md
+++ b/docs.it4i/software/tools/ansys/licensing.md
@@ -15,6 +15,7 @@ The licence intended to be used for science and research, publications, students
 The licence intended to be used for science and research, publications, students’ projects, commercial research with no commercial use restrictions.
 
 ## Server / Port
+
 lic-ansys.vsb.cz / 1055 (2325)
 
 ## Available Versions
diff --git a/mkdocs.yml b/mkdocs.yml
index 0e9d92d02..04c8362f5 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -94,6 +94,7 @@ nav:
       - Available Salomon Modules: modules-salomon.md
       - Available Salomon Modules on UV: modules-salomon-uv.md
       - Available Salomon Modules on PHI Cards: modules-salomon-phi.md
+      - Available DGX-2 Modules: modules-dgx.md  
     - ISV Licenses: software/isv_licenses.md
     - Bioinformatics:
       - Bioinformatics Applications: software/bio/bioinformatics.md
diff --git a/scripts/combinations.py b/scripts/combinations.py
index 8186da1ab..067f39f0c 100644
--- a/scripts/combinations.py
+++ b/scripts/combinations.py
@@ -1,6 +1,6 @@
 import itertools
 import re
-l = ['A', 'S', 'U', 'T']
+l = ['A', 'S', 'U', 'T', 'D']
 mask = ''.join(reversed(l))
 for i in range(1,len(l)+1):
     for comb in itertools.combinations(l, i):
diff --git a/scripts/get_cvs.sh b/scripts/get_cvs.sh
index 703af83ef..eaf721d0a 100755
--- a/scripts/get_cvs.sh
+++ b/scripts/get_cvs.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o modules-anselm.csv
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o modules-salomon.csv
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o modules-salomon-uv.csv
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o modules-salomon-phi.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o anselm.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o salomon.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/dgx.csv -o dgx.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o uv2000.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o phi.csv
diff --git a/scripts/get_modules.sh b/scripts/get_modules.sh
index bade433da..882b5ff78 100755
--- a/scripts/get_modules.sh
+++ b/scripts/get_modules.sh
@@ -1,9 +1,11 @@
 #!/bin/bash
 curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.md -o docs.it4i/modules-anselm.md
 curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.md -o docs.it4i/modules-salomon.md
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/dgx.md -o docs.it4i/modules-dgx.md
 curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.md -o docs.it4i/modules-salomon-uv.md
 curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.md -o docs.it4i/modules-salomon-phi.md
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o scripts/modules-anselm.csv
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o scripts/modules-salomon.csv
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o scripts/modules-salomon-uv.csv
-curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o scripts/modules-salomon-phi.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/anselm.csv -o scripts/anselm.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/salomon.csv -o scripts/salomon.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/dgx.csv -o scripts/dgx.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/uv2000.csv -o scripts/uv2000.csv
+curl -s https://code.it4i.cz/sccs/it4i-modules/raw/master/phi.csv -o scripts/phi.csv
diff --git a/scripts/matrix.py b/scripts/matrix.py
new file mode 100755
index 000000000..1765624f5
--- /dev/null
+++ b/scripts/matrix.py
@@ -0,0 +1,19 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import csv
+import collections
+import itertools
+import re
+from distutils.version import LooseVersion
+# Cluster acronyms, ordered from the lowest bit to the highest.
+l = ['A', 'S', 'U', 'P', 'D']
+# Reversed so that mask[0] lines up with the most significant bit.
+mask = ''.join(reversed(l))
+# Print one acronym string per bit pattern, in ascending binary order:
+# a set bit shows its cluster letter, a clear bit shows '-'
+# (e.g. 00011 -> "---SA").
+for bits in itertools.product([0, 1], repeat=len(l)):
+    # bits[0] is the most significant bit, matching mask[0].
+    ns = ''.join(letter if bit else '-' for letter, bit in zip(mask, bits))
+    # print() with one argument gives the same output on Python 2 and 3.
+    print(ns)
diff --git a/scripts/modules-json.py b/scripts/modules-json.py
index 57474110d..fa7934182 100755
--- a/scripts/modules-json.py
+++ b/scripts/modules-json.py
@@ -1,8 +1,12 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
+
 import csv
 import collections
 import json
+import itertools
+import re
+
 from distutils.version import LooseVersion
 
 def get_data(filename):
@@ -15,30 +19,29 @@ def get_data(filename):
         return list(reader)  # only return the reader when you have finished.
 
 your_list = []
-your_list += get_data('./scripts/modules-anselm.csv')
-your_list += get_data('./scripts/modules-salomon.csv')
-your_list += get_data('./scripts/modules-salomon-uv.csv')
-#print your_list
+your_list += get_data('./scripts/anselm.csv')
+your_list += get_data('./scripts/salomon.csv')
+your_list += get_data('./scripts/uv2000.csv')
+your_list += get_data('./scripts/phi.csv')
+your_list += get_data('./scripts/dgx.csv')
 
-#a=[["python/2.8.1",1],["python/2.9.1",2],["python/2.8.1",4],["python/3.0.1",4]]
 counts = dict()
 for i in your_list:
-  #print i[0]
-  #print int(i[1])
   counts[i[0]]=counts.get(i[0], 0) + int(i[1])
 
-#print sorted(counts.items())
-
-c=[
-"---",
-"--A",
-"-S-",
-"-SA",
-"U--",
-"U-A",
-"US-",
-"USA",
-]
+l = ['A', 'S', 'U', 'P', 'D']
+c = []
+mask = ''.join(reversed(l))
+from itertools import product
+for bits in product([0, 1], repeat=len(l)):
+    s = "".join(str(bit) for bit in bits)
+    ns = ""
+    for i in range(len(s)):
+        if s[i] == "1":
+            ns += mask[i]
+        else:
+            ns += "-"
+    c.append(ns)
 
 software = dict()
 versions = ''
diff --git a/scripts/modules-matrix.py b/scripts/modules-matrix.py
index 70cb5118d..6329510f4 100755
--- a/scripts/modules-matrix.py
+++ b/scripts/modules-matrix.py
@@ -1,7 +1,11 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
+
 import csv
 import collections
+import itertools
+import re
+
 from distutils.version import LooseVersion
 
 def get_data(filename):
@@ -14,41 +18,47 @@ def get_data(filename):
         return list(reader)  # only return the reader when you have finished.
 
 your_list = []
-your_list += get_data('./scripts/modules-anselm.csv')
-your_list += get_data('./scripts/modules-salomon.csv')
-your_list += get_data('./scripts/modules-salomon-uv.csv')
-#print your_list
+your_list += get_data('./scripts/anselm.csv')
+your_list += get_data('./scripts/salomon.csv')
+your_list += get_data('./scripts/uv2000.csv')
+your_list += get_data('./scripts/phi.csv')
+your_list += get_data('./scripts/dgx.csv')
+
+# Raw rows are deliberately not printed: stdout carries the generated Markdown.
 
-#a=[["python/2.8.1",1],["python/2.9.1",2],["python/2.8.1",4],["python/3.0.1",4]]
 counts = dict()
 for i in your_list:
   #print i[0]
   #print int(i[1])
   counts[i[0]]=counts.get(i[0], 0) + int(i[1])
 
-#print sorted(counts.items())
-
-c=[
-"---",
-"--A",
-"-S-",
-"-SA",
-"U--",
-"U-A",
-"US-",
-"USA",
-]
+l = ['A', 'S', 'U', 'P', 'D', 'B']
+c = []
+mask = ''.join(reversed(l))
+from itertools import product
+for bits in product([0, 1], repeat=len(l)):
+    s = "".join(str(bit) for bit in bits)
+    ns = ""
+    for i in range(len(s)):
+        if s[i] == "1":
+            ns += mask[i]
+        else:
+            ns += "-"
+    c.append(ns)
 
 print '!!! Hint "Cluster Acronyms"'
 print '    ```'
-print '    USA'
-print '    |||'
-print '    ||+---->Anselm'
-print '    |+----->Salomon'
-print '    +------>UV2000'
+print '    B D P U S A'  # letters must match the reversed entries of l
+print '    | | | | | |'
+print '    | | | | | +----> Anselm'
+print '    | | | | +------> Salomon'
+print '    | | | +--------> UV2000'
+print '    | | +----------> Phi'
+print '    | +------------> DGX-2'
+print '    +--------------> New Cluster'
 print '    ```'
 print
-print '| Module </br><form><input id="searchInput" placeholder="🔍 Filter" style="width: 8rem; border-radius: 0.2rem; color: black; padding-left: .2rem;"><form> | Versions | Clusters |'
+print '| Module </br><input id="searchInput" placeholder="🔍 Filter" style="width: 8rem; border-radius: 0.2rem; color: black; padding-left: .2rem;"> | Versions | Clusters |'
 print "| ------ | -------- | -------- |"
 
 software = dict()
@@ -57,9 +67,9 @@ clusters = ''
 prev = ''
 
 for m,i in sorted(counts.items()):
-  #print m
+#  print m
   split =  m.split('/')
-  #print split
+#  print split
   if len(split) > 1:
     a = split[0]
     b = split[1]
@@ -68,8 +78,6 @@ for m,i in sorted(counts.items()):
     software[a][b] = '`' + c[i] + '`'
     prev = a
 
-#print software.items()
-
 for m in sorted(software.items(), key=lambda i: i[0].lower()):
   software = m[0]
   versions = []
-- 
GitLab