From aaa80a7eb1a8968389cf81fbce1000f3e92e3b5e Mon Sep 17 00:00:00 2001
From: Jan Siwiec <jan.siwiec@vsb.cz>
Date: Tue, 17 Jan 2023 07:11:47 +0100
Subject: [PATCH] Upload New File

---
 docs.it4i/software/nvidia-hip.md | 205 +++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 docs.it4i/software/nvidia-hip.md

diff --git a/docs.it4i/software/nvidia-hip.md b/docs.it4i/software/nvidia-hip.md
new file mode 100644
index 000000000..5c1f76ac1
--- /dev/null
+++ b/docs.it4i/software/nvidia-hip.md
@@ -0,0 +1,205 @@
+# NVIDIA HIP
+
+## Introduction
+
+ROCm HIP allows developers to convert [CUDA code][a] to portable C++. The same source code can be compiled to run on NVIDIA or AMD GPUs.
+
+## This page documents the use of a pre-built Singularity/Apptainer image on Karolina accelerated nodes (acnXX)
+
+## Installed Versions of Singularity/Apptainer
+
+For the current list of installed versions, use:
+
+```console
+module avail apptainer
+```
+
+Load the required module:
+
+```console
+module load apptainer/1.1.5
+```
+
+Run the container:
+```console
+singularity shell /home/vic0092/rocm/centos7-nvidia-rocm.sif
+```
+
+The above gives you the Singularity/Apptainer shell prompt:
+```console
+Singularity> 
+```
+
+Verify that you have GPUs active and accessible on the given node:
+```console
+nvidia-smi
+```
+
+You should get output similar to:
+```console
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 515.65.07    Driver Version: 515.65.07    CUDA Version: 11.7     |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|                               |                      |               MIG M. |
+|===============================+======================+======================|
+|   0  NVIDIA A100-SXM...  Off  | 00000000:07:00.0 Off |                    0 |
+| N/A   26C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   1  NVIDIA A100-SXM...  Off  | 00000000:0B:00.0 Off |                    0 |
+| N/A   26C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   2  NVIDIA A100-SXM...  Off  | 00000000:48:00.0 Off |                    0 |
+| N/A   22C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   3  NVIDIA A100-SXM...  Off  | 00000000:4C:00.0 Off |                    0 |
+| N/A   25C    P0    52W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   4  NVIDIA A100-SXM...  Off  | 00000000:88:00.0 Off |                    0 |
+| N/A   22C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   5  NVIDIA A100-SXM...  Off  | 00000000:8B:00.0 Off |                    0 |
+| N/A   26C    P0    54W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   6  NVIDIA A100-SXM...  Off  | 00000000:C8:00.0 Off |                    0 |
+| N/A   25C    P0    52W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+|   7  NVIDIA A100-SXM...  Off  | 00000000:CB:00.0 Off |                    0 |
+| N/A   26C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
+|                               |                      |             Disabled |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes:                                                                  |
+|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
+|        ID   ID                                                   Usage      |
+|=============================================================================|
+|  No running processes found                                                 |
++-----------------------------------------------------------------------------+
+```
+
+
+### Code Example
+
+In this section, we show a basic code example. You can directly copy and paste the code to test it:
+
+```cpp
+// filename : /tmp/sample.cu
+
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+// Evaluate a CUDA runtime call and abort with a diagnostic on failure.
+// Wrapped in do { ... } while (0) so that `CHECK(x);` expands to a single
+// statement and composes safely inside un-braced if/else branches.
+#define CHECK(cmd) \
+do {\
+    cudaError_t error  = cmd;\
+    if (error != cudaSuccess) { \
+        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
+        exit(EXIT_FAILURE);\
+    }\
+} while (0)
+
+
+/* 
+ * Square each element in the array A and write to array C.
+ */
+template <typename T>
+__global__ void
+vector_square(T *C_d, T *A_d, size_t N)
+{
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;
+
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
+}
+
+
+// Host driver: allocates buffers, runs vector_square on device 0, and
+// verifies the result element-by-element against the host computation.
+int main(int argc, char *argv[])
+{
+    float *A_d, *C_d;
+    float *A_h, *C_h;
+    size_t N = 1000000;
+    size_t Nbytes = N * sizeof(float);
+
+    cudaDeviceProp props;
+    CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
+    printf ("info: running on device %s\n", props.name);
+
+    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    A_h = (float*)malloc(Nbytes);
+    CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
+    C_h = (float*)malloc(Nbytes);
+    CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++) 
+    {
+        A_h[i] = 1.618f + i; 
+    }
+
+    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(cudaMalloc(&A_d, Nbytes));
+    CHECK(cudaMalloc(&C_d, Nbytes));
+
+
+    printf ("info: copy Host2Device\n");
+    CHECK ( cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice));
+
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+
+    printf ("info: launch 'vector_square' kernel\n");
+    vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
+    // Kernel launches are asynchronous and return no status; ask the
+    // runtime explicitly so a bad launch configuration is not silent.
+    CHECK(cudaGetLastError());
+
+    printf ("info: copy Device2Host\n");
+    CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost));
+
+    printf ("info: check result\n");
+    for (size_t i=0; i<N; i++)  {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            CHECK(cudaErrorUnknown);
+        }
+    }
+    printf ("PASSED!\n");
+
+    // Release device and host allocations before exiting.
+    CHECK(cudaFree(A_d));
+    CHECK(cudaFree(C_d));
+    free(A_h);
+    free(C_h);
+
+    return 0;
+}
+```
+
+First convert the CUDA sample code into HIP code:
+```console
+cd /tmp
+/opt/rocm/hip/bin/hipify-perl sample.cu > sample.cpp
+```
+
+This code can then be compiled using the following commands:
+```console
+cd /tmp
+export HIP_PLATFORM=$( /opt/rocm/hip/bin/hipconfig --platform )
+export HIPCC=/opt/rocm/hip/bin/hipcc
+$HIPCC sample.cpp -o sample
+```
+
+Running it, you should get the following output:
+```console
+Singularity> cd /tmp
+Singularity> ./sample
+info: running on device NVIDIA A100-SXM4-40GB
+info: allocate host mem (  7.63 MB)
+info: allocate device mem (  7.63 MB)
+info: copy Host2Device
+info: launch 'vector_square' kernel
+info: copy Device2Host
+info: check result
+PASSED!
+```
+
+That's all folks!
+
+[a]: nvidia-cuda.md
-- 
GitLab