From ae247fdafe72c21662c8035abd635130f8754b93 Mon Sep 17 00:00:00 2001 From: milan jaros <milan.jaros@vsb.cz> Date: Wed, 31 Jan 2024 12:58:18 +0200 Subject: [PATCH] fix srun in scripts, fix path to singularity --- README.md | 11 ++--------- .../05-install-container-torch2.0.1-rocm5.5.1.sh | 3 ++- .../01-simple-test-direct-torch1.13.1-rocm5.2.3.sh | 2 +- .../02-simple-test-venv-torch1.13.1-rocm5.2.3.sh | 2 +- .../03-simple-test-conda-torch1.13.1-rocm5.2.3.sh | 2 +- .../04-simple-test-source-torch1.13.1-rocm5.2.3.sh | 2 +- .../05-simple-test-container-torch2.0.1-rocm5.5.1.sh | 5 +++-- 7 files changed, 11 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 0b91e9d..14c55b0 100644 --- a/README.md +++ b/README.md @@ -33,17 +33,10 @@ module load rocm/5.2.3.lua ## Pytorch simple test (calling of torch.cuda.device_count()) -### Base environment +### Run an interactive job ``` -module purge -module load CrayEnv -module load PrgEnv-cray/8.3.3 -module load craype-accel-amd-gfx90a -module load cray-python - -# Default ROCm – more recent versions are preferable (e.g. ROCm 5.6.0) -module load rocm/5.2.3.lua +salloc -A project_XXX --partition=standard-g -N 1 -n 1 --gres=gpu:8 -t 01:00:00 ``` ### Scripts diff --git a/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh b/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh index 51ae02e..22dede6 100755 --- a/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh +++ b/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh @@ -14,7 +14,8 @@ wd=$(pwd) set -x # download from https://hub.docker.com/r/rocm/pytorch or use from Samuel Antao: -SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif +#SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif +SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-d55f9163ed80.sif rm -rf $wd/run-me.sh cat > $wd/run-me.sh << EOF diff --git a/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh b/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh index 00b096e..9159f91 100755 --- a/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh +++ b/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh @@ -28,5 +28,5 @@ if [ ! -d $wd/pip-installs ] ; then fi PYTHONPATH=$wd/pip-installs \ - srun -n1 --gpus 8 \ + srun -n 1 \ python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())' diff --git a/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh b/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh index 1696487..be9bf2e 100755 --- a/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh +++ b/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh @@ -29,5 +29,5 @@ else source cray-python-virtualenv/bin/activate fi -srun -n1 --gpus 8 \ +srun -n 1 \ python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())' diff --git a/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh b/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh index b11da4c..56efb63 100755 --- a/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh +++ b/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh @@ -28,5 +28,5 @@ else source $wd/miniconda3/bin/activate pytorch fi -srun -n1 --gpus 8 \ +srun -n 1 \ python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())' diff --git a/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh b/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh index eeb0d30..03cf58d 100755 --- a/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh +++ b/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh @@ -31,5 +31,5 @@ fi # Make sure conda libs can be loaded. export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH -srun -n1 --gpus 8 \ +srun -n 1 \ python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())' diff --git a/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh b/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh index d16c020..c03e814 100755 --- a/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh +++ b/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh @@ -14,7 +14,8 @@ wd=$(pwd) set -x # download from https://hub.docker.com/r/rocm/pytorch or use from Samuel Antao: -SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif +#SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif +SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-d55f9163ed80.sif # check script if [ ! -f $wd/run-me.sh ] ; then @@ -22,7 +23,7 @@ if [ ! -f $wd/run-me.sh ] ; then exit 1 fi -srun -n1 --gpus 8 \ +srun -n 1 \ singularity exec \ -B /var/spool/slurmd:/var/spool/slurmd \ -B /opt/cray:/opt/cray \ -- GitLab