Upload New File

c6ddeed1 · Jan Siwiec · b346793e · c6ddeed1
Commit c6ddeed1 authored 2 years ago by Jan Siwiec
--- a/docs.it4i/cs/amd.md
+++ b/docs.it4i/cs/amd.md
+# Using AMD Partition
+
+For testing your application on the AMD partition,
+you need to prepare a job script for that partition or use the interactive job:
+
+```
+salloc -N 1 -c 64 -A PROJECT-ID -p p03-amd --gres=gpu:4 --time=08:00:00
+```
+where: 
+- -N 1 means allocating one server, 
+- -c 64 means allocation 64 cores,  
+- -A is your project, 
+- -p p03-amd is AMD partition, 
+- --gres=gpu:4 means allcating all 4 GPUs of the node,
+- --time=08:00:00 means allocation for 8 hours.  
+
+You have also an option to allocate subset of the resources only, by reducing the -c and --gres=gpu to smaller values. 
+
+```
+salloc -N 1 -c 48 -A PROJECT-ID -p p03-amd --gres=gpu:3 --time=08:00:00
+salloc -N 1 -c 32 -A PROJECT-ID -p p03-amd --gres=gpu:2 --time=08:00:00
+salloc -N 1 -c 16 -A PROJECT-ID -p p03-amd --gres=gpu:1 --time=08:00:00
+```
+
+### Note: 
+
+p03-amd01 server has hyperthreading enabled therefore htop shows 128 cores.
+
+p03-amd02 server has hyperthreading dissabled therefore htop shows 64 cores.
+
+
+## Using AMD MI100 GPUs
+
+The AMD GPUs can be programmed using the ROCm open-source platform (see: https://docs.amd.com/ for more information.)
+
+ROCm and related libraries are installed directly in the system. You can find it here: 
+```
+/opt/rocm/
+```
+The actual version can be found here: 
+```
+[user@p03-amd02.cs]$ cat /opt/rocm/.info/version
+
+5.5.1-74
+```
+
+## Basic HIP code
+
+The first way how to program AMD GPUs is to use HIP. 
+
+The basic vector addition code in HIP looks like this. This a full code and you can copy and paste it into a file. For this example we use `vector_add.hip.cpp` .  
+
+```
+#include <cstdio>
+#include <hip/hip_runtime.h>
+
+
+
+__global__ void add_vectors(float * x, float * y, float alpha, int count)
+{
+    long long idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(idx < count)
+        y[idx] += alpha * x[idx];
+}
+
+int main()
+{
+    // number of elements in the vectors
+    long long count = 10;
+
+    // allocation and initialization of data on the host (CPU memory)
+    float * h_x = new float[count];
+    float * h_y = new float[count];
+    for(long long i = 0; i < count; i++)
+    {
+        h_x[i] = i;
+        h_y[i] = 10 * i;
+    }
+
+    // print the input data
+    printf("X:");
+    for(long long i = 0; i < count; i++)
+        printf(" %7.2f", h_x[i]);
+    printf("\n");
+    printf("Y:");
+    for(long long i = 0; i < count; i++)
+        printf(" %7.2f", h_y[i]);
+    printf("\n");
+    
+    // allocation of memory on the GPU device
+    float * d_x;
+    float * d_y;
+    hipMalloc(&d_x, count * sizeof(float));
+    hipMalloc(&d_y, count * sizeof(float));
+
+    // copy the data from host memory to the device
+    hipMemcpy(d_x, h_x, count * sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(d_y, h_y, count * sizeof(float), hipMemcpyHostToDevice);
+
+    int tpb = 256;
+    int bpg = (count - 1) / tpb + 1;
+    // launch the kernel on the GPU
+    add_vectors<<< bpg, tpb >>>(d_x, d_y, 100, count);
+    // hipLaunchKernelGGL(add_vectors, bpg, tpb, 0, 0, d_x, d_y, 100, count);
+
+    // copy the result back to CPU memory
+    hipMemcpy(h_y, d_y, count * sizeof(float), hipMemcpyDeviceToHost);
+
+    // print the results
+    printf("Y:");
+    for(long long i = 0; i < count; i++)
+        printf(" %7.2f", h_y[i]);
+    printf("\n");
+
+    // free the allocated memory
+    hipFree(d_x);
+    hipFree(d_y);
+    delete[] h_x;
+    delete[] h_y;
+
+    return 0;
+}
+```
+
+To compile the code we use `hipcc` compiler. The compiler information can be found like this: 
+
+````
+[user@p03-amd02.cs ~]$ hipcc --version 
+
+HIP version: 5.5.30202-eaf00c0b
+AMD clang version 16.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.5.1 23194 69ef12a7c3cc5b0ccf820bc007bd87e8b3ac3037)
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm-5.5.1/llvm/bin
+````
+
+The code is compiled a follows: 
+
+```
+hipcc vector_add.hip.cpp -o vector_add.x
+```
+
+The correct output of the code is: 
+```
+[user@p03-amd02.cs ~]$ ./vector_add.x 
+X:    0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00
+Y:    0.00   10.00   20.00   30.00   40.00   50.00   60.00   70.00   80.00   90.00
+Y:    0.00  110.00  220.00  330.00  440.00  550.00  660.00  770.00  880.00  990.00
+```
+
+## HIP and ROCm libraries
+
+The list of official AMD libraries can be found here: https://docs.amd.com/category/libraries. 
+
+
+
+The libraries are installed in the same directory is ROCm 
+```
+/opt/rocm/
+```
+
+Following libraries are installed: 
+```
+drwxr-xr-x  4 root root   44 Jun  7 14:09 hipblas
+drwxr-xr-x  3 root root   17 Jun  7 14:09 hipblas-clients
+drwxr-xr-x  3 root root   29 Jun  7 14:09 hipcub
+drwxr-xr-x  4 root root   44 Jun  7 14:09 hipfft
+drwxr-xr-x  3 root root   25 Jun  7 14:09 hipfort
+drwxr-xr-x  4 root root   32 Jun  7 14:09 hiprand
+drwxr-xr-x  4 root root   44 Jun  7 14:09 hipsolver
+drwxr-xr-x  4 root root   44 Jun  7 14:09 hipsparse
+```
+
+and 
+
+```
+drwxr-xr-x  4 root root   32 Jun  7 14:09 rocalution
+drwxr-xr-x  4 root root   44 Jun  7 14:09 rocblas
+drwxr-xr-x  4 root root   44 Jun  7 14:09 rocfft
+drwxr-xr-x  4 root root   32 Jun  7 14:09 rocprim
+drwxr-xr-x  4 root root   32 Jun  7 14:09 rocrand
+drwxr-xr-x  4 root root   44 Jun  7 14:09 rocsolver
+drwxr-xr-x  4 root root   44 Jun  7 14:09 rocsparse
+drwxr-xr-x  3 root root   29 Jun  7 14:09 rocthrust
+```
+
+
+
+### Using hipBlas library
+
+The basic code in HIP that uses hipBlas looks like this. This a full code and you can copy and paste it into a file. For this example we use `hipblas.hip.cpp` .  
+
+```
+#include <cstdio>
+#include <vector>
+#include <cstdlib>
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+
+
+int main()
+{    
+    srand(9600);
+
+    int width = 10;
+    int height = 7;
+    int elem_count = width * height;
+
+
+    // initialization of data in CPU memory
+
+    float * h_A;
+    hipHostMalloc(&h_A, elem_count * sizeof(*h_A));
+    for(int i = 0; i < elem_count; i++)
+        h_A[i] = (100.0f * rand()) / (float)RAND_MAX;
+    printf("Matrix A:\n");
+    for(int r = 0; r < height; r++)
+    {
+        for(int c = 0; c < width; c++)
+            printf("%6.3f  ", h_A[r + height * c]);
+        printf("\n");
+    }
+
+    float * h_x;
+    hipHostMalloc(&h_x, width * sizeof(*h_x));
+    for(int i = 0; i < width; i++)
+        h_x[i] = (100.0f * rand()) / (float)RAND_MAX;
+    printf("vector x:\n");
+    for(int i = 0; i < width; i++)
+        printf("%6.3f  ", h_x[i]);
+    printf("\n");
+    
+    float * h_y;
+    hipHostMalloc(&h_y, height * sizeof(*h_y));
+    for(int i = 0; i < height; i++)
+        h_x[i] = 100.0f + i;
+    printf("vector y:\n");
+    for(int i = 0; i < height; i++)
+        printf("%6.3f  ", h_x[i]);
+    printf("\n");
+
+    
+    // initialization of data in GPU memory
+
+    float * d_A;
+    size_t pitch_A;
+    hipMallocPitch((void**)&d_A, &pitch_A, height * sizeof(*d_A), width);
+    hipMemcpy2D(d_A, pitch_A, h_A, height * sizeof(*d_A), height * sizeof(*d_A), width, hipMemcpyHostToDevice);
+    int lda = pitch_A / sizeof(float);
+
+    float * d_x;
+    hipMalloc(&d_x, width * sizeof(*d_x));
+    hipMemcpy(d_x, h_x, width * sizeof(*d_x), hipMemcpyHostToDevice);
+    
+    float * d_y;
+    hipMalloc(&d_y, height * sizeof(*d_y));
+    hipMemcpy(d_y, h_y, height * sizeof(*d_y), hipMemcpyHostToDevice);
+
+
+    // basic calculation of the result on the CPU
+
+    float alpha=2.0f, beta=10.0f;
+
+    for(int i = 0; i < height; i++)
+        h_y[i] *= beta;
+    for(int r = 0; r < height; r++)
+        for(int c = 0; c < width; c++)
+            h_y[r] += alpha * h_x[c] * h_A[r + height * c];
+    printf("result y CPU:\n");
+    for(int i = 0; i < height; i++)
+        printf("%6.3f  ", h_y[i]);
+    printf("\n");
+    
+    
+    // calculation of the result on the GPU using the hipBLAS library
+
+    hipblasHandle_t blas_handle;
+    hipblasCreate(&blas_handle);
+
+    hipblasSgemv(blas_handle, HIPBLAS_OP_N, height, width, &alpha, d_A, lda, d_x, 1, &beta, d_y, 1);
+    hipDeviceSynchronize();
+
+    hipblasDestroy(blas_handle);
+    
+
+    // copy the GPU result to CPU memory and print it
+    hipMemcpy(h_y, d_y, height * sizeof(*d_y), hipMemcpyDeviceToHost);
+    printf("result y BLAS:\n");
+    for(int i = 0; i < height; i++)
+        printf("%6.3f  ", h_y[i]);
+    printf("\n");
+
+
+    // free all the allocated memory
+    hipFree(d_A);
+    hipFree(d_x);
+    hipFree(d_y);
+    hipHostFree(h_A);
+    hipHostFree(h_x);
+    hipHostFree(h_y);
+
+    return 0;
+}
+```
+
+The code compilation can be done as follows: 
+```
+hipcc hipblas.hip.cpp -o hipblas.x -lhipblas
+```
+
+### Using hipSolver library
+
+The basic code in HIP that uses hipSolver looks like this. This a full code and you can copy and paste it into a file. For this example we use `hipsolver.hip.cpp` .  
+
+```
+#include <cstdio>
+#include <vector>
+#include <cstdlib>
+#include <algorithm>
+#include <hipsolver/hipsolver.h>
+#include <hipblas/hipblas.h>
+
+int main()
+{
+    srand(63456);
+
+    int size = 10;
+
+
+    // allocation and initialization of data on host. this time we use std::vector
+
+    int h_A_ld = size;
+    int h_A_pitch = h_A_ld * sizeof(float);
+    std::vector<float> h_A(size * h_A_ld);
+    for(int r = 0; r < size; r++)
+        for(int c = 0; c < size; c++)
+            h_A[r * h_A_ld + c] = (10.0 * rand()) / RAND_MAX;
+    printf("System matrix A:\n");
+    for(int r = 0; r < size; r++)
+    {
+        for(int c = 0; c < size; c++)
+            printf("%6.3f  ", h_A[r * h_A_ld + c]);
+        printf("\n");
+    }    
+    
+    std::vector<float> h_b(size);
+    for(int i = 0; i < size; i++)
+        h_b[i] = (10.0 * rand()) / RAND_MAX;
+    printf("RHS vector b:\n");
+    for(int i = 0; i < size; i++)
+        printf("%6.3f  ", h_b[i]);
+    printf("\n");
+
+    std::vector<float> h_x(size);
+
+
+    // memory allocation on the device and initialization
+
+    float * d_A;
+    size_t d_A_pitch;
+    hipMallocPitch((void**)&d_A, &d_A_pitch, size, size);
+    int d_A_ld = d_A_pitch / sizeof(float);
+
+    float * d_b;
+    hipMalloc(&d_b, size * sizeof(float));
+    
+    float * d_x;
+    hipMalloc(&d_x, size * sizeof(float));
+
+    int * d_piv;
+    hipMalloc(&d_piv, size * sizeof(int));
+
+    int * info;
+    hipMallocManaged(&info, sizeof(int));
+
+    hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
+    hipMemcpy(d_b, h_b.data(), size * sizeof(float), hipMemcpyHostToDevice);
+    
+
+    // solving the system using hipSOLVER
+
+    hipsolverHandle_t solverHandle;
+    hipsolverCreate(&solverHandle);
+
+    int wss_trf, wss_trs; // wss = WorkSpace Size
+    hipsolverSgetrf_bufferSize(solverHandle, size, size, d_A, d_A_ld, &wss_trf);
+    hipsolverSgetrs_bufferSize(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, &wss_trs);
+    float * workspace;
+    int wss = std::max(wss_trf, wss_trs);
+    hipMalloc(&workspace, wss * sizeof(float));
+    
+    hipsolverSgetrf(solverHandle, size, size, d_A, d_A_ld, workspace, wss, d_piv, info);
+    hipsolverSgetrs(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, workspace, wss, info);
+
+    hipMemcpy(d_x, d_b, size * sizeof(float), hipMemcpyDeviceToDevice);
+    hipMemcpy(h_x.data(), d_x, size * sizeof(float), hipMemcpyDeviceToHost);
+    printf("Solution vector x:\n");
+    for(int i = 0; i < size; i++)
+        printf("%6.3f  ", h_x[i]);
+    printf("\n");
+
+    hipFree(workspace);
+
+    hipsolverDestroy(solverHandle);
+
+
+    // perform matrix-vector multiplication A*x using hipBLAS to check if the solution is correct
+
+    hipblasHandle_t blasHandle;
+    hipblasCreate(&blasHandle);
+
+    float alpha = 1;
+    float beta = 0;
+    hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice);
+    hipblasSgemv(blasHandle, HIPBLAS_OP_N, size, size, &alpha, d_A, d_A_ld, d_x, 1, &beta, d_b, 1);
+    hipDeviceSynchronize();
+
+    hipblasDestroy(blasHandle);
+
+    for(int i = 0; i < size; i++)
+        h_b[i] = 0;
+    hipMemcpy(h_b.data(), d_b, size * sizeof(float), hipMemcpyDeviceToHost);
+    printf("Check multiplication vector Ax:\n");
+    for(int i = 0; i < size; i++)
+        printf("%6.3f  ", h_b[i]);
+    printf("\n");
+
+
+    // free all the allocated memory
+
+    hipFree(info);
+    hipFree(d_piv);
+    hipFree(d_x);
+    hipFree(d_b);
+    hipFree(d_A);
+
+    return 0;
+}
+```
+
+The code compilation can be done as follows: 
+```
+hipcc hipsolver.hip.cpp -o hipsolver.x -lhipblas -lhipsolver
+```
+
+### Other AMD libraries and frameworks 
+
+
+
+
+
+Please see [gcc options](https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html) for more advanced compilation settings.
+No complications are expected as long as the application does not use any intrinsic for `x64` architecture.
+If you want to use intrinsic,
+[SVE](https://developer.arm.com/documentation/102699/0100/Optimizing-with-intrinsics) instruction set is available.
\ No newline at end of file