From aaa80a7eb1a8968389cf81fbce1000f3e92e3b5e Mon Sep 17 00:00:00 2001
From: Jan Siwiec <jan.siwiec@vsb.cz>
Date: Tue, 17 Jan 2023 07:11:47 +0100
Subject: [PATCH] Upload New File

---
 docs.it4i/software/nvidia-hip.md | 205 +++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 docs.it4i/software/nvidia-hip.md

diff --git a/docs.it4i/software/nvidia-hip.md b/docs.it4i/software/nvidia-hip.md
new file mode 100644
index 000000000..5c1f76ac1
--- /dev/null
+++ b/docs.it4i/software/nvidia-hip.md
@@ -0,0 +1,205 @@
+# ROCm HIP on NVIDIA GPUs
+
+## Introduction
+
+ROCm HIP allows developers to convert [CUDA code][a] to portable C++. The same source code can be compiled to run on NVIDIA or AMD GPUs.
+
+This page documents the use of a pre-built Singularity/Apptainer image on Karolina accelerated nodes (acnXX).
+
+## Installed Versions of Singularity/Apptainer
+
+For the current list of installed versions, use:
+
+```console
+module avail apptainer
+```
+
+Load the required module:
+
+```console
+module load apptainer/1.1.5
+```
+
+Run the container:
+
+```console
+singularity shell /home/vic0092/rocm/centos7-nvidia-rocm.sif
+```
+
+The above command gives you a Singularity/Apptainer shell prompt:
+
+```console
+Singularity>
+```
+
+Verify that you have GPUs active and accessible on the given node:
+
+```console
+nvidia-smi
+```
+
+You should get output similar to:
+
+```console
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 515.65.07    Driver Version: 515.65.07    CUDA Version: 11.7     |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|                               |                      |               MIG M. |
+|===============================+======================+======================|
+|   0  NVIDIA A100-SXM...  Off  | 00000000:07:00.0 Off |                    0 |
+| N/A   26C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   1  NVIDIA A100-SXM...  Off  | 00000000:0B:00.0 Off |                    0 |
+| N/A   26C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   2  NVIDIA A100-SXM...  Off  | 00000000:48:00.0 Off |                    0 |
+| N/A   22C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   3  NVIDIA A100-SXM...  Off  | 00000000:4C:00.0 Off |                    0 |
+| N/A   25C    P0    52W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   4  NVIDIA A100-SXM...  Off  | 00000000:88:00.0 Off |                    0 |
+| N/A   22C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   5  NVIDIA A100-SXM...  Off  | 00000000:8B:00.0 Off |                    0 |
+| N/A   26C    P0    54W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   6  NVIDIA A100-SXM...  Off  | 00000000:C8:00.0 Off |                    0 |
+| N/A   25C    P0    52W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   7  NVIDIA A100-SXM...  Off  | 00000000:CB:00.0 Off |                    0 |
+| N/A   26C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes:                                                                  |
+|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
+|        ID   ID                                                   Usage      |
+|=============================================================================|
+|  No running processes found                                                 |
++-----------------------------------------------------------------------------+
+```
+
+### Code Example
+
+In this section, we show a basic code example. You can directly copy and paste the code to test it:
+
+```cpp
+// filename : /tmp/sample.cu
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#define CHECK(cmd) \
+{\
+    cudaError_t error = cmd;\
+    if (error != cudaSuccess) { \
+        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \
+        exit(EXIT_FAILURE);\
+    }\
+}
+
+/*
+ * Square each element in the array A and write to array C.
+ */
+template <typename T>
+__global__ void
+vector_square(T *C_d, T *A_d, size_t N)
+{
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    float *A_d, *C_d;
+    float *A_h, *C_h;
+    size_t N = 1000000;
+    size_t Nbytes = N * sizeof(float);
+
+    cudaDeviceProp props;
+    CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
+    printf("info: running on device %s\n", props.name);
+
+    printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    A_h = (float*)malloc(Nbytes);
+    CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess);
+    C_h = (float*)malloc(Nbytes);
+    CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess);
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++)
+    {
+        A_h[i] = 1.618f + i;
+    }
+
+    printf("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(cudaMalloc(&A_d, Nbytes));
+    CHECK(cudaMalloc(&C_d, Nbytes));
+
+    printf("info: copy Host2Device\n");
+    CHECK(cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice));
+
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+
+    printf("info: launch 'vector_square' kernel\n");
+    vector_square<<<blocks, threadsPerBlock>>>(C_d, A_d, N);
+
+    printf("info: copy Device2Host\n");
+    CHECK(cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost));
+
+    printf("info: check result\n");
+    for (size_t i=0; i<N; i++) {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            CHECK(cudaErrorUnknown);
+        }
+    }
+    printf("PASSED!\n");
+}
+```
+
+First, convert the CUDA sample code into HIP code:
+
+```console
+cd /tmp
+/opt/rocm/hip/bin/hipify-perl sample.cu > sample.cpp
+```
+
+This code can then be compiled using the following commands:
+
+```console
+cd /tmp
+export HIP_PLATFORM=$( /opt/rocm/hip/bin/hipconfig --platform )
+export HIPCC=/opt/rocm/hip/bin/hipcc
+$HIPCC sample.cpp -o sample
+```
+
+Running it, you should get the following output:
+
+```console
+Singularity> cd /tmp
+Singularity> ./sample
+info: running on device NVIDIA A100-SXM4-40GB
+info: allocate host mem ( 7.63 MB)
+info: allocate device mem ( 7.63 MB)
+info: copy Host2Device
+info: launch 'vector_square' kernel
+info: copy Device2Host
+info: check result
+PASSED!
+```
+
+That's all folks!
+
+[a]: nvidia-cuda.md
--
GitLab
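
For reference, the sketch below illustrates roughly what the hipified `sample.cpp` ends up looking like: `hipify-perl` swaps the `cuda*` runtime calls for their `hip*` counterparts and replaces `cuda_runtime.h` with `hip/hip_runtime.h`. This is a hand-condensed illustration rather than the literal converter output (recent ROCm versions may simply keep the `<<<...>>>` launch syntax, and the file name `sample_hip.cpp` is only an example); it builds with the same `$HIPCC sample_hip.cpp -o sample_hip` invocation shown above.

```cpp
// filename : /tmp/sample_hip.cpp -- illustrative, hand-condensed HIP port of sample.cu

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>   // was <cuda_runtime.h>

#define CHECK(cmd) \
{\
    hipError_t error = cmd;\
    if (error != hipSuccess) { \
        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error, __FILE__, __LINE__); \
        exit(EXIT_FAILURE);\
    }\
}

// Square each element of A_d and write it to C_d (same kernel body as the CUDA version).
__global__ void vector_square(float *C_d, const float *A_d, size_t N)
{
    size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;

    for (size_t i = offset; i < N; i += stride) {
        C_d[i] = A_d[i] * A_d[i];
    }
}

int main()
{
    const size_t N = 1000000;
    const size_t Nbytes = N * sizeof(float);

    hipDeviceProp_t props;
    CHECK(hipGetDeviceProperties(&props, 0));   // cudaGetDeviceProperties -> hipGetDeviceProperties
    printf("info: running on device %s\n", props.name);

    float *A_h = (float*)malloc(Nbytes);
    float *C_h = (float*)malloc(Nbytes);
    for (size_t i = 0; i < N; i++) A_h[i] = 1.618f + i;   // fill with Phi + i

    float *A_d, *C_d;
    CHECK(hipMalloc((void**)&A_d, Nbytes));     // cudaMalloc -> hipMalloc
    CHECK(hipMalloc((void**)&C_d, Nbytes));
    CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));   // cudaMemcpy -> hipMemcpy

    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;
    // the <<<...>>> launch becomes the portable hipLaunchKernelGGL macro
    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
    CHECK(hipGetLastError());

    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

    for (size_t i = 0; i < N; i++) {
        if (C_h[i] != A_h[i] * A_h[i]) {
            fprintf(stderr, "FAILED at element %zu\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("PASSED!\n");

    CHECK(hipFree(A_d));
    CHECK(hipFree(C_d));
    free(A_h);
    free(C_h);
    return 0;
}
```

Because `hipcc` forwards compilation to `nvcc` when the detected platform is NVIDIA, this HIP source runs unchanged on the Karolina A100 nodes and, in principle, on AMD GPUs as well.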