diff --git a/README.md b/README.md
index 0b91e9d52cf654c7987aa755d69571f68d2a409d..14c55b07229fd90554e5c68c8b8bcc770904666e 100644
--- a/README.md
+++ b/README.md
@@ -33,17 +33,10 @@ module load rocm/5.2.3.lua
 
 ## Pytorch simple test (calling of torch.cuda.device_count())
 
-### Base environment
+### Run an interactive job
 
 ```
-module purge
-module load CrayEnv
-module load PrgEnv-cray/8.3.3
-module load craype-accel-amd-gfx90a
-module load cray-python
-
-# Default ROCm – more recent versions are preferable (e.g. ROCm 5.6.0)
-module load rocm/5.2.3.lua
+salloc -A project_XXX --partition=standard-g -N 1 -n 1 --gres=gpu:8 -t 01:00:00
 ```
 
 ### Scripts
diff --git a/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh b/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh
index 51ae02e306e7b5160c25a3345480697b1146191b..22dede652bcfeda53195c2ab31566c603702d77f 100755
--- a/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh
+++ b/scripts/install/05-install-container-torch2.0.1-rocm5.5.1.sh
@@ -14,7 +14,8 @@ wd=$(pwd)
 set -x
 
 # download from https://hub.docker.com/r/rocm/pytorch or use from Samuel Antao:
-SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif
+#SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif
+SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-d55f9163ed80.sif
 
 rm -rf $wd/run-me.sh
 cat > $wd/run-me.sh << EOF
diff --git a/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh b/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh
index 00b096e640cf751d26c80ca80a04bca760958d5a..9159f91859d3c93bc09c6b7e959764169416bbc9 100755
--- a/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh
+++ b/scripts/tests/01-simple-test-direct-torch1.13.1-rocm5.2.3.sh
@@ -28,5 +28,5 @@ if [ ! -d $wd/pip-installs ] ; then
 fi
 
 PYTHONPATH=$wd/pip-installs \
-  srun -n1 --gpus 8 \
+  srun -n 1 \
   python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())'
diff --git a/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh b/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh
index 1696487dcb6fd641b5cb658d222cafe982740cad..be9bf2e649d3c58420bb7379bcc8cae2219c55c2 100755
--- a/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh
+++ b/scripts/tests/02-simple-test-venv-torch1.13.1-rocm5.2.3.sh
@@ -29,5 +29,5 @@ else
   source cray-python-virtualenv/bin/activate
 fi
 
-srun -n1 --gpus 8 \
+srun -n 1 \
   python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())'
diff --git a/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh b/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh
index b11da4c47e4cca15dd46dd6f7cc7e62be0ce01d0..56efb63f6c781d51251a8227c391332336bb15d5 100755
--- a/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh
+++ b/scripts/tests/03-simple-test-conda-torch1.13.1-rocm5.2.3.sh
@@ -28,5 +28,5 @@ else
   source $wd/miniconda3/bin/activate pytorch
 fi
 
-srun -n1 --gpus 8 \
+srun -n 1 \
   python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())'
diff --git a/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh b/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh
index eeb0d30f51369cad71d58e72cf958053e59f767b..03cf58d34dffc7c9b4202ba5b4c82217936c9335 100755
--- a/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh
+++ b/scripts/tests/04-simple-test-source-torch1.13.1-rocm5.2.3.sh
@@ -31,5 +31,5 @@ fi
 # Make sure conda libs can be loaded.
 export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
 
-srun -n1 --gpus 8 \
+srun -n 1 \
   python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())'
diff --git a/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh b/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh
index d16c020329cacd6a420404f416f1a8df69a0f8e2..c03e8147b8f66c712a02917387209b50f11109d3 100755
--- a/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh
+++ b/scripts/tests/05-simple-test-container-torch2.0.1-rocm5.5.1.sh
@@ -14,7 +14,8 @@ wd=$(pwd)
 set -x
 
 # download from https://hub.docker.com/r/rocm/pytorch or use from Samuel Antao:
-SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif
+#SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-4305da4654f4.sif
+SIF=/pfs/lustrep2/projappl/project_462000125/samantao-public/containers/lumi-pytorch-rocm-5.5.1-python-3.10-pytorch-v2.0.1-dockerhash-d55f9163ed80.sif
 
 # check script
 if [ ! -f $wd/run-me.sh ] ; then
@@ -22,7 +23,7 @@ if [ ! -f $wd/run-me.sh ] ; then
   exit 1
 fi
 
-srun -n1 --gpus 8 \
+srun -n 1 \
   singularity exec \
     -B /var/spool/slurmd:/var/spool/slurmd \
    -B /opt/cray:/opt/cray \
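
A minimal sketch of the interactive workflow these scripts assume, with `project_XXX` standing in for a real project allocation and the `python -c` one-liner taken from the test scripts:

```
# Request an interactive allocation: one node, one task, all 8 GPUs of a standard-g node.
salloc -A project_XXX --partition=standard-g -N 1 -n 1 --gres=gpu:8 -t 01:00:00

# Inside the allocation the GPUs requested via --gres are inherited by job steps,
# so a plain single-task launch is enough for torch to see them.
srun -n 1 \
  python -c 'import torch; print("I have this many devices:", torch.cuda.device_count())'
```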