diff --git a/docs.it4i/general/slurm-job-submission-and-execution.md b/docs.it4i/general/slurm-job-submission-and-execution.md index acf00fe3f57af619e3609ac1df63f06064046ace..4885bb3a9d3578b3ff00e7f79f0710738e13c394 100644 --- a/docs.it4i/general/slurm-job-submission-and-execution.md +++ b/docs.it4i/general/slurm-job-submission-and-execution.md @@ -300,9 +300,9 @@ $ sbatch --parsable --dependency=afterok:${second} job3.sh 1581 $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1579 p01-arm job1 opr0019 PD 0:00 8 (Priority) - 1580 p01-arm job2 opr0019 PD 0:00 8 (Dependency) - 1581 p01-arm job3 opr0019 PD 0:00 8 (Dependency) + 1579 p01-arm job1 user017 PD 0:00 8 (Priority) + 1580 p01-arm job2 user017 PD 0:00 8 (Dependency) + 1581 p01-arm job3 user017 PD 0:00 8 (Dependency) $ scontrol show job 1580 | grep JobState JobState=PENDING Reason=Dependency Dependency=afterok:1579(unfulfilled) $ scontrol show job 1581 | grep JobState @@ -329,12 +329,12 @@ $ for id in $(seq 2 6) > done $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1582 p01-arm jobstart opr0019 PD 0:00 8 (Priority) - 1583 p01-arm restart opr0019 PD 0:00 8 (Dependency) - 1584 p01-arm restart opr0019 PD 0:00 8 (Dependency) - 1585 p01-arm restart opr0019 PD 0:00 8 (Dependency) - 1586 p01-arm restart opr0019 PD 0:00 8 (Dependency) - 1587 p01-arm restart opr0019 PD 0:00 8 (Dependency) + 1582 p01-arm jobstart user017 PD 0:00 8 (Priority) + 1583 p01-arm restart user017 PD 0:00 8 (Dependency) + 1584 p01-arm restart user017 PD 0:00 8 (Dependency) + 1585 p01-arm restart user017 PD 0:00 8 (Dependency) + 1586 p01-arm restart user017 PD 0:00 8 (Dependency) + 1587 p01-arm restart user017 PD 0:00 8 (Dependency) $ for id in $(seq 1582 1587) > do > scontrol show job $id | grep JobState @@ -371,13 +371,26 @@ To view information about available job partitions, use the `sinfo` command: ```console $ sinfo -PARTITION AVAIL TIMELIMIT NODES STATE NODELIST -p00-arm up 1-00:00:00 1 idle p00-arm01 
-p01-arm* up 1-00:00:00 8 idle p01-arm[01-08] -p02-intel up 1-00:00:00 2 idle p02-intel[01-02] -p03-amd up 1-00:00:00 2 idle p03-amd[01-02] -p04-edge up 1-00:00:00 1 idle p04-edge01 -p05-synt up 1-00:00:00 1 idle p05-synt01 +PARTITION AVAIL TIMELIMIT NODES STATE NODELIST +qcpu* up 2-00:00:00 191 idle cn[1-67,69-192] +qcpu_biz up 2-00:00:00 1 alloc cn68 +qcpu_biz up 2-00:00:00 191 idle cn[1-67,69-192] +qcpu_exp up 1:00:00 1 alloc cn68 +qcpu_exp up 1:00:00 191 idle cn[1-67,69-192] +qcpu_free up 18:00:00 1 alloc cn68 +qcpu_free up 18:00:00 191 idle cn[1-67,69-192] +qcpu_long up 6-00:00:00 1 alloc cn68 +qcpu_long up 6-00:00:00 191 idle cn[1-67,69-192] +qcpu_preempt up 12:00:00 1 alloc cn68 +qcpu_preempt up 12:00:00 191 idle cn[1-67,69-192] +qgpu up 2-00:00:00 8 idle cn[193-200] +qgpu_biz up 2-00:00:00 8 idle cn[193-200] +qgpu_exp up 1:00:00 8 idle cn[193-200] +qgpu_free up 18:00:00 8 idle cn[193-200] +qgpu_preempt up 12:00:00 8 idle cn[193-200] +qfat up 2-00:00:00 1 idle cn201 +qdgx up 2-00:00:00 1 idle cn202 +qviz up 8:00:00 2 idle vizserv[1-2] ``` Here we can see the output of the `sinfo` command run on the Barbora cluster. By default, it shows basic node and partition configurations. 
@@ -387,12 +400,12 @@ To view partition summary information, use `sinfo -s`, or `sinfo --summarize`: ```console $ sinfo -s PARTITION AVAIL TIMELIMIT NODES(A/I/O/T) NODELIST -qcpu* up 2-00:00:00 0/192/0/192 cn[1-192] -qcpu_biz up 2-00:00:00 0/192/0/192 cn[1-192] -qcpu_exp up 1:00:00 0/192/0/192 cn[1-192] -qcpu_free up 18:00:00 0/192/0/192 cn[1-192] -qcpu_long up 6-00:00:00 0/192/0/192 cn[1-192] -qcpu_preempt up 12:00:00 0/192/0/192 cn[1-192] +qcpu* up 2-00:00:00 1/191/0/192 cn[1-192] +qcpu_biz up 2-00:00:00 1/191/0/192 cn[1-192] +qcpu_exp up 1:00:00 1/191/0/192 cn[1-192] +qcpu_free up 18:00:00 1/191/0/192 cn[1-192] +qcpu_long up 6-00:00:00 1/191/0/192 cn[1-192] +qcpu_preempt up 12:00:00 1/191/0/192 cn[1-192] qgpu up 2-00:00:00 0/8/0/8 cn[193-200] qgpu_biz up 2-00:00:00 0/8/0/8 cn[193-200] qgpu_exp up 1:00:00 0/8/0/8 cn[193-200] @@ -437,9 +450,9 @@ To view information about queued jobs, use the `squeue` command: ```console $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1556 p01-arm interact opr0019 R 1:07 8 p01-arm[01-08] - 1558 p02-intel interact easybuil CD 0:21 2 p02-intel[01-02] - 1557 p03-amd interact opr0019 R 0:57 1 p03-amd01 + 1556 p01-arm interact user017 R 1:07 8 p01-arm[01-08] + 1558 p02-intel interact user018 CD 0:21 2 p02-intel[01-02] + 1557 p03-amd interact user017 R 0:57 1 p03-amd01 ``` By default, this shows the job ID, partition, name of the job, job owner's username, job state, how long the job has been running, number of allocated nodes, and a list of allocated nodes. 
@@ -449,13 +462,13 @@ To view jobs only belonging to a particular user, you can either use `--user=<us ```console $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1559 p01-arm interact opr0019 R 3:37 8 p01-arm[01-08] - 1560 p02-intel interact easybuil R 0:05 2 p02-intel[01-02] - 1557 p03-amd interact opr0019 R 10:22 1 p03-amd01 + 1559 p01-arm interact user017 R 3:37 8 p01-arm[01-08] + 1560 p02-intel interact user018 R 0:05 2 p02-intel[01-02] + 1557 p03-amd interact user017 R 10:22 1 p03-amd01 $ squeue --me JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1559 p01-arm interact opr0019 R 4:04 8 p01-arm[01-08] - 1557 p03-amd interact opr0019 R 10:49 1 p03-amd01 + 1559 p01-arm interact user017 R 4:04 8 p01-arm[01-08] + 1557 p03-amd interact user017 R 10:49 1 p03-amd01 ``` `squeue` also allows for printing information about specific jobs using the `--jobs` flag: @@ -463,8 +476,8 @@ $ squeue --me ```console $ squeue --jobs 1557,1560 JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1560 p02-intel interact easybuil R 2:09 2 p02-intel[01-02] - 1557 p03-amd interact opr0019 R 12:26 1 p03-amd01 + 1560 p02-intel interact user018 R 2:09 2 p02-intel[01-02] + 1557 p03-amd interact user017 R 12:26 1 p03-amd01 ``` For more information about the `squeue` command, its flags, and formatting options, see the manual, either by using the `man squeue` command or [online][f]. 
@@ -505,15 +518,15 @@ For more information about the `squeue` command, its flags, and formatting optio ```console $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1563 p01-arm interact opr0019 PD 0:00 3 (Resources) - 1562 p01-arm interact opr0019 R 0:20 8 p01-arm[01-08] - 1561 p03-amd interact opr0019 R 0:25 1 p03-amd01 + 1563 p01-arm interact user017 PD 0:00 3 (Resources) + 1562 p01-arm interact user017 R 0:20 8 p01-arm[01-08] + 1561 p03-amd interact user017 R 0:25 1 p03-amd01 $ scancel 1562 $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1562 p01-arm interact opr0019 CA 0:57 8 p01-arm[01-08] - 1563 p01-arm interact opr0019 R 0:03 3 p01-arm[01-03] - 1561 p03-amd interact opr0019 R 1:06 1 p03-amd01 + 1562 p01-arm interact user017 CA 0:57 8 p01-arm[01-08] + 1563 p01-arm interact user017 R 0:03 3 p01-arm[01-03] + 1561 p03-amd interact user017 R 1:06 1 p03-amd01 ``` To cancel multiple jobs, simply list all of their job IDs: @@ -521,19 +534,19 @@ To cancel multiple jobs, simply list all of their job IDs: ```console $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1562 p01-arm interact opr0019 CA 0:57 8 p01-arm[01-08] - 1564 p01-arm interact opr0019 PD 0:00 8 (Resources) - 1563 p01-arm interact opr0019 R 3:31 3 p01-arm[01-03] - 1561 p03-amd interact opr0019 R 4:34 1 p03-amd01 - 1565 p03-amd interact opr0019 R 0:07 1 p03-amd01 + 1562 p01-arm interact user017 CA 0:57 8 p01-arm[01-08] + 1564 p01-arm interact user017 PD 0:00 8 (Resources) + 1563 p01-arm interact user017 R 3:31 3 p01-arm[01-03] + 1561 p03-amd interact user017 R 4:34 1 p03-amd01 + 1565 p03-amd interact user017 R 0:07 1 p03-amd01 $ scancel 1562 1563 1561 1565 1564 $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1562 p01-arm interact opr0019 CA 0:57 8 p01-arm[01-08] - 1563 p01-arm interact opr0019 CA 4:01 3 p01-arm[01-03] - 1564 p01-arm interact opr0019 CA 0:00 8 - 1561 p03-amd interact opr0019 CA 5:04 1 p03-amd01 - 1565 p03-amd 
interact opr0019 CA 0:37 1 p03-amd01 + 1562 p01-arm interact user017 CA 0:57 8 p01-arm[01-08] + 1563 p01-arm interact user017 CA 4:01 3 p01-arm[01-03] + 1564 p01-arm interact user017 CA 0:00 8 + 1561 p03-amd interact user017 CA 5:04 1 p03-amd01 + 1565 p03-amd interact user017 CA 0:37 1 p03-amd01 ``` Slurm also allows for canceling only jobs which fulfill certain criteria, for example, the partition to which they have been submitted, or their job state (or a mixture of both): @@ -541,30 +554,30 @@ Slurm also allows for canceling only jobs which fulfill certain criteria, for ex ```console $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1569 p01-arm interact opr0019 PD 0:00 3 (Resources) - 1571 p01-arm interact opr0019 PD 0:00 3 (Priority) - 1568 p01-arm interact opr0019 R 1:52 8 p01-arm[01-08] - 1566 p03-amd interact opr0019 R 1:55 1 p03-amd01 - 1567 p03-amd interact opr0019 R 1:53 1 p03-amd01 - 1570 p03-amd interact opr0019 R 0:26 1 p03-amd01 + 1569 p01-arm interact user017 PD 0:00 3 (Resources) + 1571 p01-arm interact user017 PD 0:00 3 (Priority) + 1568 p01-arm interact user017 R 1:52 8 p01-arm[01-08] + 1566 p03-amd interact user017 R 1:55 1 p03-amd01 + 1567 p03-amd interact user017 R 1:53 1 p03-amd01 + 1570 p03-amd interact user017 R 0:26 1 p03-amd01 $ scancel --partition p03-amd $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1569 p01-arm interact opr0019 PD 0:00 3 (Resources) - 1571 p01-arm interact opr0019 PD 0:00 3 (Priority) - 1568 p01-arm interact opr0019 R 3:08 8 p01-arm[01-08] - 1566 p03-amd interact opr0019 CA 2:42 1 p03-amd01 - 1567 p03-amd interact opr0019 CA 2:40 1 p03-amd01 - 1570 p03-amd interact opr0019 CA 1:13 1 p03-amd01 + 1569 p01-arm interact user017 PD 0:00 3 (Resources) + 1571 p01-arm interact user017 PD 0:00 3 (Priority) + 1568 p01-arm interact user017 R 3:08 8 p01-arm[01-08] + 1566 p03-amd interact user017 CA 2:42 1 p03-amd01 + 1567 p03-amd interact user017 CA 2:40 1 p03-amd01 + 1570 p03-amd interact 
user017 CA 1:13 1 p03-amd01 $ scancel --partition p01-arm --state R $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1568 p01-arm interact opr0019 CA 4:29 8 p01-arm[01-08] - 1569 p01-arm interact opr0019 R 0:07 3 p01-arm[01-03] - 1571 p01-arm interact opr0019 R 0:07 3 p01-arm[04-06] - 1566 p03-amd interact opr0019 CA 2:42 1 p03-amd01 - 1567 p03-amd interact opr0019 CA 2:40 1 p03-amd01 - 1570 p03-amd interact opr0019 CA 1:13 1 p03-amd01 + 1568 p01-arm interact user017 CA 4:29 8 p01-arm[01-08] + 1569 p01-arm interact user017 R 0:07 3 p01-arm[01-03] + 1571 p01-arm interact user017 R 0:07 3 p01-arm[04-06] + 1566 p03-amd interact user017 CA 2:42 1 p03-amd01 + 1567 p03-amd interact user017 CA 2:40 1 p03-amd01 + 1570 p03-amd interact user017 CA 1:13 1 p03-amd01 ``` For more information about the `scancel` command, its flags, and formatting options, see the manual, either by using the `man scancel` command or [online][g]. @@ -580,8 +593,8 @@ To view detailed job information, you can use `scontrol show job <job_id>`, for ```console $ scontrol show job 1571 - UserId=opr0019(5856) GroupId=opr0019(6432) MCS_label=N/A - Priority=4294901692 Nice=0 Account=easybuild QOS=normal + UserId=user017(5856) GroupId=user017(6432) MCS_label=N/A + Priority=4294901692 Nice=0 Account=user018 QOS=normal JobState=RUNNING Reason=None Dependency=(null) Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=0:0 DerivedExitCode=0:0 @@ -603,7 +616,7 @@ $ scontrol show job 1571 Features=(null) DelayBoot=00:00:00 OverSubscribe=NO Contiguous=0 Licenses=(null) Network=(null) Command=(null) - WorkDir=/home/opr0019 + WorkDir=/home/user017 Power= ``` @@ -615,24 +628,24 @@ Sometimes you may want to temporarily prevent your job from running. 
For this re ```console $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1572 p01-arm interact opr0019 PD 0:00 8 (Resources) - 1576 p01-arm interact opr0019 PD 0:00 8 (Priority) - 1569 p01-arm interact opr0019 R 1:14:11 3 p01-arm[01-03] - 1571 p01-arm interact opr0019 R 1:14:11 3 p01-arm[04-06] + 1572 p01-arm interact user017 PD 0:00 8 (Resources) + 1576 p01-arm interact user017 PD 0:00 8 (Priority) + 1569 p01-arm interact user017 R 1:14:11 3 p01-arm[01-03] + 1571 p01-arm interact user017 R 1:14:11 3 p01-arm[04-06] $ scontrol hold 1572 1576 $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1576 p01-arm interact opr0019 PD 0:00 8 (JobHeldUser) - 1572 p01-arm interact opr0019 PD 0:00 8 (JobHeldUser) - 1569 p01-arm interact opr0019 R 1:14:32 3 p01-arm[01-03] - 1571 p01-arm interact opr0019 R 1:14:32 3 p01-arm[04-06] + 1576 p01-arm interact user017 PD 0:00 8 (JobHeldUser) + 1572 p01-arm interact user017 PD 0:00 8 (JobHeldUser) + 1569 p01-arm interact user017 R 1:14:32 3 p01-arm[01-03] + 1571 p01-arm interact user017 R 1:14:32 3 p01-arm[04-06] $ scontrol release 1572 1576 $ squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) - 1572 p01-arm interact opr0019 PD 0:00 8 (Resources) - 1576 p01-arm interact opr0019 PD 0:00 8 (Priority) - 1569 p01-arm interact opr0019 R 1:14:39 3 p01-arm[01-03] - 1571 p01-arm interact opr0019 R 1:14:39 3 p01-arm[04-06] + 1572 p01-arm interact user017 PD 0:00 8 (Resources) + 1576 p01-arm interact user017 PD 0:00 8 (Priority) + 1569 p01-arm interact user017 R 1:14:39 3 p01-arm[01-03] + 1571 p01-arm interact user017 R 1:14:39 3 p01-arm[04-06] ``` `scontrol` also offers an interactive mode, where you can run commands in quick succession: @@ -683,9 +696,9 @@ which would result in 3 output files, by default called `slurm-${SLURM_ARRAY_JOB ```console $ ls -l slurm*.out --rw-rw-r-- 1 opr0019 opr0019 159 Feb 17 14:47 slurm-1632_1.out --rw-rw-r-- 1 opr0019 opr0019 159 Feb 17 14:47 slurm-1632_2.out 
--rw-rw-r-- 1 opr0019 opr0019 159 Feb 17 14:47 slurm-1632_3.out +-rw-rw-r-- 1 user017 user017 159 Feb 17 14:47 slurm-1632_1.out +-rw-rw-r-- 1 user017 user017 159 Feb 17 14:47 slurm-1632_2.out +-rw-rw-r-- 1 user017 user017 159 Feb 17 14:47 slurm-1632_3.out ``` with the following contents: