diff --git a/docs.it4i/cs/amd.md b/docs.it4i/cs/amd.md new file mode 100644 index 0000000000000000000000000000000000000000..0cc12f3934c632359b6bbc25c5dfaeaa16c4ad9c --- /dev/null +++ b/docs.it4i/cs/amd.md @@ -0,0 +1,456 @@ +# Using AMD Partition + +For testing your application on the AMD partition, +you need to prepare a job script for that partition or use the interactive job: + +``` +salloc -N 1 -c 64 -A PROJECT-ID -p p03-amd --gres=gpu:4 --time=08:00:00 +``` +where: +- -N 1 means allocating one server, +- -c 64 means allocation 64 cores, +- -A is your project, +- -p p03-amd is AMD partition, +- --gres=gpu:4 means allcating all 4 GPUs of the node, +- --time=08:00:00 means allocation for 8 hours. + +You have also an option to allocate subset of the resources only, by reducing the -c and --gres=gpu to smaller values. + +``` +salloc -N 1 -c 48 -A PROJECT-ID -p p03-amd --gres=gpu:3 --time=08:00:00 +salloc -N 1 -c 32 -A PROJECT-ID -p p03-amd --gres=gpu:2 --time=08:00:00 +salloc -N 1 -c 16 -A PROJECT-ID -p p03-amd --gres=gpu:1 --time=08:00:00 +``` + +### Note: + +p03-amd01 server has hyperthreading enabled therefore htop shows 128 cores. + +p03-amd02 server has hyperthreading dissabled therefore htop shows 64 cores. + + +## Using AMD MI100 GPUs + +The AMD GPUs can be programmed using the ROCm open-source platform (see: https://docs.amd.com/ for more information.) + +ROCm and related libraries are installed directly in the system. You can find it here: +``` +/opt/rocm/ +``` +The actual version can be found here: +``` +[user@p03-amd02.cs]$ cat /opt/rocm/.info/version + +5.5.1-74 +``` + +## Basic HIP code + +The first way how to program AMD GPUs is to use HIP. + +The basic vector addition code in HIP looks like this. This a full code and you can copy and paste it into a file. For this example we use `vector_add.hip.cpp` . + +``` +#include <cstdio> +#include <hip/hip_runtime.h> + + + +__global__ void add_vectors(float * x, float * y, float alpha, int count) +{ + long long idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(idx < count) + y[idx] += alpha * x[idx]; +} + +int main() +{ + // number of elements in the vectors + long long count = 10; + + // allocation and initialization of data on the host (CPU memory) + float * h_x = new float[count]; + float * h_y = new float[count]; + for(long long i = 0; i < count; i++) + { + h_x[i] = i; + h_y[i] = 10 * i; + } + + // print the input data + printf("X:"); + for(long long i = 0; i < count; i++) + printf(" %7.2f", h_x[i]); + printf("\n"); + printf("Y:"); + for(long long i = 0; i < count; i++) + printf(" %7.2f", h_y[i]); + printf("\n"); + + // allocation of memory on the GPU device + float * d_x; + float * d_y; + hipMalloc(&d_x, count * sizeof(float)); + hipMalloc(&d_y, count * sizeof(float)); + + // copy the data from host memory to the device + hipMemcpy(d_x, h_x, count * sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(d_y, h_y, count * sizeof(float), hipMemcpyHostToDevice); + + int tpb = 256; + int bpg = (count - 1) / tpb + 1; + // launch the kernel on the GPU + add_vectors<<< bpg, tpb >>>(d_x, d_y, 100, count); + // hipLaunchKernelGGL(add_vectors, bpg, tpb, 0, 0, d_x, d_y, 100, count); + + // copy the result back to CPU memory + hipMemcpy(h_y, d_y, count * sizeof(float), hipMemcpyDeviceToHost); + + // print the results + printf("Y:"); + for(long long i = 0; i < count; i++) + printf(" %7.2f", h_y[i]); + printf("\n"); + + // free the allocated memory + hipFree(d_x); + hipFree(d_y); + delete[] h_x; + delete[] h_y; + + return 0; +} +``` + +To compile the code we use `hipcc` compiler. The compiler information can be found like this: + +```` +[user@p03-amd02.cs ~]$ hipcc --version + +HIP version: 5.5.30202-eaf00c0b +AMD clang version 16.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.5.1 23194 69ef12a7c3cc5b0ccf820bc007bd87e8b3ac3037) +Target: x86_64-unknown-linux-gnu +Thread model: posix +InstalledDir: /opt/rocm-5.5.1/llvm/bin +```` + +The code is compiled a follows: + +``` +hipcc vector_add.hip.cpp -o vector_add.x +``` + +The correct output of the code is: +``` +[user@p03-amd02.cs ~]$ ./vector_add.x +X: 0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 +Y: 0.00 10.00 20.00 30.00 40.00 50.00 60.00 70.00 80.00 90.00 +Y: 0.00 110.00 220.00 330.00 440.00 550.00 660.00 770.00 880.00 990.00 +``` + +## HIP and ROCm libraries + +The list of official AMD libraries can be found here: https://docs.amd.com/category/libraries. + + + +The libraries are installed in the same directory is ROCm +``` +/opt/rocm/ +``` + +Following libraries are installed: +``` +drwxr-xr-x 4 root root 44 Jun 7 14:09 hipblas +drwxr-xr-x 3 root root 17 Jun 7 14:09 hipblas-clients +drwxr-xr-x 3 root root 29 Jun 7 14:09 hipcub +drwxr-xr-x 4 root root 44 Jun 7 14:09 hipfft +drwxr-xr-x 3 root root 25 Jun 7 14:09 hipfort +drwxr-xr-x 4 root root 32 Jun 7 14:09 hiprand +drwxr-xr-x 4 root root 44 Jun 7 14:09 hipsolver +drwxr-xr-x 4 root root 44 Jun 7 14:09 hipsparse +``` + +and + +``` +drwxr-xr-x 4 root root 32 Jun 7 14:09 rocalution +drwxr-xr-x 4 root root 44 Jun 7 14:09 rocblas +drwxr-xr-x 4 root root 44 Jun 7 14:09 rocfft +drwxr-xr-x 4 root root 32 Jun 7 14:09 rocprim +drwxr-xr-x 4 root root 32 Jun 7 14:09 rocrand +drwxr-xr-x 4 root root 44 Jun 7 14:09 rocsolver +drwxr-xr-x 4 root root 44 Jun 7 14:09 rocsparse +drwxr-xr-x 3 root root 29 Jun 7 14:09 rocthrust +``` + + + +### Using hipBlas library + +The basic code in HIP that uses hipBlas looks like this. This a full code and you can copy and paste it into a file. For this example we use `hipblas.hip.cpp` . + +``` +#include <cstdio> +#include <vector> +#include <cstdlib> +#include <hip/hip_runtime.h> +#include <hipblas/hipblas.h> + + +int main() +{ + srand(9600); + + int width = 10; + int height = 7; + int elem_count = width * height; + + + // initialization of data in CPU memory + + float * h_A; + hipHostMalloc(&h_A, elem_count * sizeof(*h_A)); + for(int i = 0; i < elem_count; i++) + h_A[i] = (100.0f * rand()) / (float)RAND_MAX; + printf("Matrix A:\n"); + for(int r = 0; r < height; r++) + { + for(int c = 0; c < width; c++) + printf("%6.3f ", h_A[r + height * c]); + printf("\n"); + } + + float * h_x; + hipHostMalloc(&h_x, width * sizeof(*h_x)); + for(int i = 0; i < width; i++) + h_x[i] = (100.0f * rand()) / (float)RAND_MAX; + printf("vector x:\n"); + for(int i = 0; i < width; i++) + printf("%6.3f ", h_x[i]); + printf("\n"); + + float * h_y; + hipHostMalloc(&h_y, height * sizeof(*h_y)); + for(int i = 0; i < height; i++) + h_x[i] = 100.0f + i; + printf("vector y:\n"); + for(int i = 0; i < height; i++) + printf("%6.3f ", h_x[i]); + printf("\n"); + + + // initialization of data in GPU memory + + float * d_A; + size_t pitch_A; + hipMallocPitch((void**)&d_A, &pitch_A, height * sizeof(*d_A), width); + hipMemcpy2D(d_A, pitch_A, h_A, height * sizeof(*d_A), height * sizeof(*d_A), width, hipMemcpyHostToDevice); + int lda = pitch_A / sizeof(float); + + float * d_x; + hipMalloc(&d_x, width * sizeof(*d_x)); + hipMemcpy(d_x, h_x, width * sizeof(*d_x), hipMemcpyHostToDevice); + + float * d_y; + hipMalloc(&d_y, height * sizeof(*d_y)); + hipMemcpy(d_y, h_y, height * sizeof(*d_y), hipMemcpyHostToDevice); + + + // basic calculation of the result on the CPU + + float alpha=2.0f, beta=10.0f; + + for(int i = 0; i < height; i++) + h_y[i] *= beta; + for(int r = 0; r < height; r++) + for(int c = 0; c < width; c++) + h_y[r] += alpha * h_x[c] * h_A[r + height * c]; + printf("result y CPU:\n"); + for(int i = 0; i < height; i++) + printf("%6.3f ", h_y[i]); + printf("\n"); + + + // calculation of the result on the GPU using the hipBLAS library + + hipblasHandle_t blas_handle; + hipblasCreate(&blas_handle); + + hipblasSgemv(blas_handle, HIPBLAS_OP_N, height, width, &alpha, d_A, lda, d_x, 1, &beta, d_y, 1); + hipDeviceSynchronize(); + + hipblasDestroy(blas_handle); + + + // copy the GPU result to CPU memory and print it + hipMemcpy(h_y, d_y, height * sizeof(*d_y), hipMemcpyDeviceToHost); + printf("result y BLAS:\n"); + for(int i = 0; i < height; i++) + printf("%6.3f ", h_y[i]); + printf("\n"); + + + // free all the allocated memory + hipFree(d_A); + hipFree(d_x); + hipFree(d_y); + hipHostFree(h_A); + hipHostFree(h_x); + hipHostFree(h_y); + + return 0; +} +``` + +The code compilation can be done as follows: +``` +hipcc hipblas.hip.cpp -o hipblas.x -lhipblas +``` + +### Using hipSolver library + +The basic code in HIP that uses hipSolver looks like this. This a full code and you can copy and paste it into a file. For this example we use `hipsolver.hip.cpp` . + +``` +#include <cstdio> +#include <vector> +#include <cstdlib> +#include <algorithm> +#include <hipsolver/hipsolver.h> +#include <hipblas/hipblas.h> + +int main() +{ + srand(63456); + + int size = 10; + + + // allocation and initialization of data on host. this time we use std::vector + + int h_A_ld = size; + int h_A_pitch = h_A_ld * sizeof(float); + std::vector<float> h_A(size * h_A_ld); + for(int r = 0; r < size; r++) + for(int c = 0; c < size; c++) + h_A[r * h_A_ld + c] = (10.0 * rand()) / RAND_MAX; + printf("System matrix A:\n"); + for(int r = 0; r < size; r++) + { + for(int c = 0; c < size; c++) + printf("%6.3f ", h_A[r * h_A_ld + c]); + printf("\n"); + } + + std::vector<float> h_b(size); + for(int i = 0; i < size; i++) + h_b[i] = (10.0 * rand()) / RAND_MAX; + printf("RHS vector b:\n"); + for(int i = 0; i < size; i++) + printf("%6.3f ", h_b[i]); + printf("\n"); + + std::vector<float> h_x(size); + + + // memory allocation on the device and initialization + + float * d_A; + size_t d_A_pitch; + hipMallocPitch((void**)&d_A, &d_A_pitch, size, size); + int d_A_ld = d_A_pitch / sizeof(float); + + float * d_b; + hipMalloc(&d_b, size * sizeof(float)); + + float * d_x; + hipMalloc(&d_x, size * sizeof(float)); + + int * d_piv; + hipMalloc(&d_piv, size * sizeof(int)); + + int * info; + hipMallocManaged(&info, sizeof(int)); + + hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice); + hipMemcpy(d_b, h_b.data(), size * sizeof(float), hipMemcpyHostToDevice); + + + // solving the system using hipSOLVER + + hipsolverHandle_t solverHandle; + hipsolverCreate(&solverHandle); + + int wss_trf, wss_trs; // wss = WorkSpace Size + hipsolverSgetrf_bufferSize(solverHandle, size, size, d_A, d_A_ld, &wss_trf); + hipsolverSgetrs_bufferSize(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, &wss_trs); + float * workspace; + int wss = std::max(wss_trf, wss_trs); + hipMalloc(&workspace, wss * sizeof(float)); + + hipsolverSgetrf(solverHandle, size, size, d_A, d_A_ld, workspace, wss, d_piv, info); + hipsolverSgetrs(solverHandle, HIPSOLVER_OP_N, size, 1, d_A, d_A_ld, d_piv, d_b, size, workspace, wss, info); + + hipMemcpy(d_x, d_b, size * sizeof(float), hipMemcpyDeviceToDevice); + hipMemcpy(h_x.data(), d_x, size * sizeof(float), hipMemcpyDeviceToHost); + printf("Solution vector x:\n"); + for(int i = 0; i < size; i++) + printf("%6.3f ", h_x[i]); + printf("\n"); + + hipFree(workspace); + + hipsolverDestroy(solverHandle); + + + // perform matrix-vector multiplication A*x using hipBLAS to check if the solution is correct + + hipblasHandle_t blasHandle; + hipblasCreate(&blasHandle); + + float alpha = 1; + float beta = 0; + hipMemcpy2D(d_A, d_A_pitch, h_A.data(), h_A_pitch, size * sizeof(float), size, hipMemcpyHostToDevice); + hipblasSgemv(blasHandle, HIPBLAS_OP_N, size, size, &alpha, d_A, d_A_ld, d_x, 1, &beta, d_b, 1); + hipDeviceSynchronize(); + + hipblasDestroy(blasHandle); + + for(int i = 0; i < size; i++) + h_b[i] = 0; + hipMemcpy(h_b.data(), d_b, size * sizeof(float), hipMemcpyDeviceToHost); + printf("Check multiplication vector Ax:\n"); + for(int i = 0; i < size; i++) + printf("%6.3f ", h_b[i]); + printf("\n"); + + + // free all the allocated memory + + hipFree(info); + hipFree(d_piv); + hipFree(d_x); + hipFree(d_b); + hipFree(d_A); + + return 0; +} +``` + +The code compilation can be done as follows: +``` +hipcc hipsolver.hip.cpp -o hipsolver.x -lhipblas -lhipsolver +``` + +### Other AMD libraries and frameworks + + + + + +Please see [gcc options](https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html) for more advanced compilation settings. +No complications are expected as long as the application does not use any intrinsic for `x64` architecture. +If you want to use intrinsic, +[SVE](https://developer.arm.com/documentation/102699/0100/Optimizing-with-intrinsics) instruction set is available. \ No newline at end of file