Newer
Older
//by Branislav Jansik, @IT4Innovations, 2024
//
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* Cycle measurement not implemented for ARM NEON yet
static __inline__ unsigned long long int get_cycles(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long int)lo)|( ((unsigned long long int)hi)<<32 );
return 1;
}
*/
int main(int argc, char **argv) {
double *ar, *br;
double rsum, rmax, rmin;
double ainc;
double to, to0;
int i, size, nsve=16, niter = 1;
//inputs
if (argc>1) niter = atoi(argv[1]);
//find out threads and SVE vector length
size=omp_get_max_threads();
//print info
printf("FLIPS: Double Precision NEON (%dbit) FMLA Instructions Per Second\nRun %d times on %d cores\n", 8*nsve, niter, size);
// allocate
posix_memalign((void *) &ar, (size_t)64, (size_t) sizeof(double)*2*nsve);
posix_memalign((void *) &br, (size_t)64, (size_t) sizeof(double)*2*nsve);
// initialize c constants within the mandelbrot set
ainc = 0.7/((double) 2*nsve);
for (i=0;i<2*nsve; i++)
ar[i] = 0.0 + ainc*i;
// iterations
for (i=1; i<=niter; i++) {
rsum=0.0;
rmax=-INFINITY;
rmin=INFINITY;
//fqmax=-INFINITY;
//fqmin=INFINITY;
to0 = omp_get_wtime();
#pragma omp parallel default(shared) reduction(+:rsum) reduction(max:rmax) reduction (min:rmin)
{
double t0,t,flips;
//cycle measurement not implemented for ARM yet
//unsigned long long int c0, c;
#pragma omp barrier
t0 = omp_get_wtime();
//c0 = get_cycles();
__asm__(
//init
"ldr q0, [%0, 0]\n\t"
"ldr q2, [%0, 16]\n\t"
"ldr q4, [%0, 32]\n\t"
"ldr q6, [%0, 48]\n\t"
"ldr q8, [%0, 64]\n\t"
"ldr q10, [%0, 80]\n\t"
"ldr q12, [%0, 96]\n\t"
"ldr q14, [%0, 112]\n\t"
"ldr q16, [%0, 128]\n\t"
"ldr q18, [%0, 144]\n\t"
"ldr q20, [%0, 160]\n\t"
"ldr q22, [%0, 176]\n\t"
"ldr q24, [%0, 192]\n\t"
"ldr q26, [%0, 208]\n\t"
"ldr q28, [%0, 224]\n\t"
"ldr q30, [%0, 240]\n\t"
//zero out registers
"eor v1.16b, v1.16b, v1.16b\n\t"
"eor v3.16b, v3.16b, v3.16b\n\t"
"eor v5.16b, v5.16b, v5.16b\n\t"
"eor v7.16b, v7.16b, v7.16b\n\t"
"eor v9.16b, v9.16b, v9.16b\n\t"
"eor v11.16b, v11.16b, v11.16b\n\t"
"eor v13.16b, v13.16b, v13.16b\n\t"
"eor v15.16b, v15.16b, v15.16b\n\t"
"eor v17.16b, v17.16b, v17.16b\n\t"
"eor v19.16b, v19.16b, v19.16b\n\t"
"eor v21.16b, v21.16b, v21.16b\n\t"
"eor v23.16b, v23.16b, v23.16b\n\t"
"eor v25.16b, v25.16b, v25.16b\n\t"
"eor v27.16b, v27.16b, v27.16b\n\t"
"eor v29.16b, v29.16b, v29.16b\n\t"
"eor v31.16b, v31.16b, v31.16b\n\t"
//zero out loop counter
"mov x0, #0\n\t"
//set loop limit to 16000000 = 4000*4000
"mov x1, #4000\n\t"
"mul x1, x1, x1\n\t"
//loop
"label:\n\t"
"add x0, x0, #1\n\t"
//jansik iterations
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
//C += Z*Z;
"fmla v0.2d, v1.2d, v1.2d\n\t"
"fmla v2.2d, v3.2d, v3.2d\n\t"
"fmla v4.2d, v5.2d, v5.2d\n\t"
"fmla v6.2d, v7.2d, v7.2d\n\t"
"fmla v8.2d, v9.2d, v9.2d\n\t"
"fmla v10.2d, v11.2d, v11.2d\n\t"
"fmla v12.2d, v13.2d, v13.2d\n\t"
"fmla v14.2d, v15.2d, v15.2d\n\t"
"fmla v16.2d, v17.2d, v17.2d\n\t"
"fmla v18.2d, v19.2d, v19.2d\n\t"
"fmla v20.2d, v21.2d, v21.2d\n\t"
"fmla v22.2d, v23.2d, v23.2d\n\t"
"fmla v24.2d, v25.2d, v25.2d\n\t"
"fmla v26.2d, v27.2d, v27.2d\n\t"
"fmla v28.2d, v29.2d, v29.2d\n\t"
"fmla v30.2d, v31.2d, v31.2d\n\t"
//Z += C*C;
"fmla v1.2d, v0.2d, v0.2d\n\t"
"fmla v3.2d, v2.2d, v2.2d\n\t"
"fmla v5.2d, v4.2d, v4.2d\n\t"
"fmla v7.2d, v6.2d, v6.2d\n\t"
"fmla v9.2d, v8.2d, v8.2d\n\t"
"fmla v11.2d, v10.2d, v10.2d\n\t"
"fmla v13.2d, v12.2d, v12.2d\n\t"
"fmla v15.2d, v14.2d, v14.2d\n\t"
"fmla v17.2d, v16.2d, v16.2d\n\t"
"fmla v19.2d, v18.2d, v18.2d\n\t"
"fmla v21.2d, v20.2d, v20.2d\n\t"
"fmla v23.2d, v21.2d, v21.2d\n\t"
"fmla v25.2d, v24.2d, v24.2d\n\t"
"fmla v27.2d, v26.2d, v26.2d\n\t"
"fmla v29.2d, v28.2d, v28.2d\n\t"
"fmla v31.2d, v30.2d, v30.2d\n\t"
//C-= Z*Z;
"fmls v0.2d, v1.2d, v1.2d\n\t"
"fmls v2.2d, v3.2d, v3.2d\n\t"
"fmls v4.2d, v5.2d, v5.2d\n\t"
"fmls v6.2d, v7.2d, v7.2d\n\t"
"fmls v8.2d, v9.2d, v9.2d\n\t"
"fmls v10.2d, v11.2d, v11.2d\n\t"
"fmls v12.2d, v13.2d, v13.2d\n\t"
"fmls v14.2d, v15.2d, v15.2d\n\t"
"fmls v16.2d, v17.2d, v17.2d\n\t"
"fmls v18.2d, v19.2d, v19.2d\n\t"
"fmls v20.2d, v21.2d, v21.2d\n\t"
"fmls v22.2d, v23.2d, v23.2d\n\t"
"fmls v24.2d, v25.2d, v25.2d\n\t"
"fmls v26.2d, v27.2d, v27.2d\n\t"
"fmls v28.2d, v29.2d, v29.2d\n\t"
"fmls v30.2d, v31.2d, v31.2d\n\t"
//Z-= C*C;
"fmls v1.2d, v0.2d, v0.2d\n\t"
"fmls v3.2d, v2.2d, v2.2d\n\t"
"fmls v5.2d, v4.2d, v4.2d\n\t"
"fmls v7.2d, v6.2d, v6.2d\n\t"
"fmls v9.2d, v8.2d, v8.2d\n\t"
"fmls v11.2d, v10.2d, v10.2d\n\t"
"fmls v13.2d, v12.2d, v12.2d\n\t"
"fmls v15.2d, v14.2d, v14.2d\n\t"
"fmls v17.2d, v16.2d, v16.2d\n\t"
"fmls v19.2d, v18.2d, v18.2d\n\t"
"fmls v21.2d, v20.2d, v20.2d\n\t"
"fmls v23.2d, v21.2d, v21.2d\n\t"
"fmls v25.2d, v24.2d, v24.2d\n\t"
"fmls v27.2d, v26.2d, v26.2d\n\t"
"fmls v29.2d, v28.2d, v28.2d\n\t"
"fmls v31.2d, v30.2d, v30.2d\n\t"
)
)
//end loop
"cmp x0, x1\n\t"
"b.lt label \n\t"
//offload
//offload to br array from multiple threads creates race condition
//we ignore it for now.
"str q1, [%1, 0]\n\t"
"str q3, [%1, 16]\n\t"
"str q5, [%1, 32]\n\t"
"str q7, [%1, 48]\n\t"
"str q9, [%1, 64]\n\t"
"str q11, [%1, 80]\n\t"
"str q13, [%1, 96]\n\t"
"str q15, [%1, 112]\n\t"
"str q17, [%1, 128]\n\t"
"str q19, [%1, 144]\n\t"
"str q21, [%1, 160]\n\t"
"str q23, [%1, 176]\n\t"
"str q25, [%1, 192]\n\t"
"str q27, [%1, 208]\n\t"
"str q29, [%1, 224]\n\t"
"str q31, [%1, 240]\n\t"
//inputs, outputs and clobbers
: : "r" (ar) , "r" (br) :
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
"q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
"q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31",
"x0", "x1",
"memory");
//c = get_cycles() - c0;
t = omp_get_wtime() - t0;
//count flips: 16000000*100*64/1e9 = 25.6
flips = 25.6/t;
//freq = c/t/1e9;
rsum=flips;
rmax=flips;
rmin=flips;
//fqmax=freq;
//fqmin=freq;
}
to = omp_get_wtime() - to0;
printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG GrandSum: %6.4fG\n",
i, rmin, rsum/size, rmax, rsum, 25.6*size/to);
}
printf("Outputs:\n");
for(i=0; i<2*nsve; i+=4)
printf("zr: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);
return 0;
}