Newer
Older
## Introduction
ROCm HIP allows developers to convert [CUDA code][a] to portable C++. The same source code can be compiled to run on NVIDIA or AMD GPUs.
This page documents the use of pre-built Singularity/apptainer image on Karolina Accelerated nodes (acn).
For the current list of installed versions, use:
```console
module avail apptainer
```
Load the required module:
```console
module load apptainer/1.1.5
```
Run the container:
```console
singularity shell /home/vic0092/rocm/centos7-nvidia-rocm.sif
```
The above gives you Singularity / apptainer shell prompt:
```console
Singularity>
```
Verify that you have GPUs active and accessible on the given node:
```console
nvidia-smi
```
You should get output similar to:
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
```console
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.07 Driver Version: 515.65.07 CUDA Version: 11.7 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A100-SXM... Off | 00000000:07:00.0 Off | 0 |
| N/A 26C P0 50W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-SXM... Off | 00000000:0B:00.0 Off | 0 |
| N/A 26C P0 51W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA A100-SXM... Off | 00000000:48:00.0 Off | 0 |
| N/A 22C P0 51W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA A100-SXM... Off | 00000000:4C:00.0 Off | 0 |
| N/A 25C P0 52W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 4 NVIDIA A100-SXM... Off | 00000000:88:00.0 Off | 0 |
| N/A 22C P0 51W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 5 NVIDIA A100-SXM... Off | 00000000:8B:00.0 Off | 0 |
| N/A 26C P0 54W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 6 NVIDIA A100-SXM... Off | 00000000:C8:00.0 Off | 0 |
| N/A 25C P0 52W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 7 NVIDIA A100-SXM... Off | 00000000:CB:00.0 Off | 0 |
| N/A 26C P0 51W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
```
### Code Example
In this section, we show a basic code example. You can directly copy and paste the code to test it:
```cpp
// filename : /tmp/sample.cu
#include <stdio.h>
#include <cuda_runtime.h>
#define CHECK(cmd) \
{\
cudaError_t error = cmd;\
if (error != cudaSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}\
}
/*
* Square each element in the array A and write to array C.
*/
template <typename T>
__global__ void
vector_square(T *C_d, T *A_d, size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] * A_d[i];
}
}
int main(int argc, char *argv[])
{
float *A_d, *C_d;
float *A_h, *C_h;
size_t N = 1000000;
size_t Nbytes = N * sizeof(float);
cudaDeviceProp props;
CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
printf ("info: running on device %s\n", props.name);
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
A_h = (float*)malloc(Nbytes);
CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
C_h = (float*)malloc(Nbytes);
CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
// Fill with Phi + i
for (size_t i=0; i<N; i++)
{
A_h[i] = 1.618f + i;
}
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
CHECK(cudaMalloc(&A_d, Nbytes));
CHECK(cudaMalloc(&C_d, Nbytes));
printf ("info: copy Host2Device\n");
CHECK ( cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice));
const unsigned blocks = 512;
const unsigned threadsPerBlock = 256;
printf ("info: launch 'vector_square' kernel\n");
vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
printf ("info: copy Device2Host\n");
CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost));
printf ("info: check result\n");
for (size_t i=0; i<N; i++) {
if (C_h[i] != A_h[i] * A_h[i]) {
CHECK(cudaErrorUnknown);
}
}
printf ("PASSED!\n");
}
```
First convert the CUDA sample code into HIP code:
```console
cd /tmp
/opt/rocm/hip/bin/hipify-perl sample.cu > sample.cpp
```
This code can then be compiled using the following commands:
```console
cd /tmp
export HIP_PLATFORM=$( /opt/rocm/hip/bin/hipconfig --platform )
export HIPCC=/opt/rocm/hip/bin/hipcc
$HIPCC sample.cpp -o sample
```
Running it, you should get the following output:
```console
Singularity> cd /tmp
Singularity> ./sample
info: running on device NVIDIA A100-SXM4-40GB
info: allocate host mem ( 7.63 MB)
info: allocate device mem ( 7.63 MB)
info: copy Host2Device
info: launch 'vector_square' kernel
info: copy Device2Host
info: check result
PASSED!
```
[a]: nvidia-cuda.md