Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include <stdio.h>
#include <stdlib.h>
// Element-wise float addition written with inline GPU assembly:
// each thread computes c[idx] = a[idx] + b[idx] for its own global index.
//
// a, b : input arrays, read at element idx.
// c    : output array, written at element idx.
//
// Launch layout: 1-D grid of 1-D blocks; only the .x components are used.
// The kernel takes no element count and has no bounds guard, so every
// launched thread loads and stores at idx -- the caller must launch exactly
// as many threads as the arrays have elements.
//
// NOTE(review): the mnemonics (global_load_dword / s_waitcnt / v_add_f32)
// look like AMD GCN ISA; the global_* forms presumably need gfx9 or newer --
// confirm against the build target.
__global__ void simple_add(const float *a, const float *b, float *c){
// Flat global thread index across the whole grid.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Registers v10, v11 and v12 are hard-coded as scratch and are therefore
// listed as clobbers, together with "memory" because the statement reads and
// writes through the pointer operands. There are no output operands: the
// result leaves the statement only through the store to c+idx.
// The "v" constraint passes each element address as a vector-register operand
// (presumably a 64-bit register pair per pointer -- TODO confirm).
asm volatile("global_load_dword v10, %0, off;\n\t" // v10 <- *(a+idx)
"global_load_dword v11, %1, off;\n\t" // v11 <- *(b+idx)
"s_waitcnt vmcnt(0);\n\t" // wait for both loads to finish before using v10/v11
"v_add_f32 v12, v11, v10;\n\t" // v12 <- v11 + v10
"global_store_dword %2, v12, off;\n\t" :: "v"(a+idx), "v"(b+idx), "v"(c+idx) : "v10", "v11", "v12", "memory"); // *(c+idx) <- v12
}
int main(){
int block_num = 4;
int block_size = 16;
float *a, *b, *c;
hipHostMalloc((void **)&a, block_num*block_size*sizeof(float));
hipHostMalloc((void **)&b, block_num*block_size*sizeof(float));
hipHostMalloc((void **)&c, block_num*block_size*sizeof(float));
float x_0 = 1.0f;
for(int i = 0; i < block_num * block_size; i++)
a[i] = x_0 + i * 0.1f;
float y_0 = 2.5f;
for(int i = 0; i < block_num * block_size; i++)
b[i] = y_0 - i * 0.2f;
hipLaunchKernelGGL(simple_add, dim3(block_num), dim3(block_size), 0, 0, a,b,c);
hipDeviceSynchronize();
printf("Operand matrix 1:\n");
for(int i = 0; i < block_num; i++){
for(int j = 0; j < block_size; j++){
printf("%.3f ", a[j+i*block_size]);
}
printf("\n");
}
printf("Operand matrix 2:\n");
for(int i = 0; i < block_num; i++){
for(int j = 0; j < block_size; j++){
printf("%.3f ", b[j+i*block_size]);
}
printf("\n");
}
printf("Result matrix:\n");
for(int i = 0; i < block_num; i++){
for(int j = 0; j < block_size; j++){
printf("%.3f ", c[j+i*block_size]);
}
printf("\n");
}
hipHostFree(a);
hipHostFree(b);
hipHostFree(c);
}